Data/HashTable.hs

   1 {-# LANGUAGE CPP, NoImplicitPrelude #-}
   2 {-# OPTIONS_GHC -funbox-strict-fields -fno-warn-name-shadowing #-}
   3
   4 -----------------------------------------------------------------------------
   5 -- |
   6 -- Module      :  Data.HashTable
   7 -- Copyright   :  (c) The University of Glasgow 2003
   8 -- License     :  BSD-style (see the file libraries/base/LICENSE)
   9 --
  10 -- Maintainer  :  libraries@haskell.org
  11 -- Stability   :  provisional
  12 -- Portability :  portable
  13 --
  14 -- An implementation of extensible hash tables, as described in
  15 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
  16 -- pp. 446--457.  The implementation is also derived from the one
  17 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
  18 --
  19 -----------------------------------------------------------------------------
  20
  21 module Data.HashTable (
  22         -- * Basic hash table operations
  23         HashTable, new, newHint, insert, delete, lookup, update,
  24         -- * Converting to and from lists
  25         fromList, toList,
  26         -- * Hash functions
  27         -- $hash_functions
  28         hashInt, hashString,
  29         prime,
  30         -- * Diagnostics
  31         longestChain
  32  ) where
  33
  34 -- This module is imported by Data.Dynamic, which is pretty low down in the
  35 -- module hierarchy, so don't import "high-level" modules
  36
  37 #ifdef __GLASGOW_HASKELL__
  38 import GHC.Base
  39 #else
  40 import Prelude  hiding  ( lookup )
  41 #endif
  42 import Data.Tuple       ( fst )
  43 import Data.Bits
  44 import Data.Maybe
  45 import Data.List        ( maximumBy, length, concat, foldl', partition )
  46 import Data.Int         ( Int32 )
  47
  48 #if defined(__GLASGOW_HASKELL__)
  49 import GHC.Num
  50 import GHC.Real         ( fromIntegral )
  51 import GHC.Show         ( Show(..) )
  52 import GHC.Int          ( Int64 )
  53
  54 import GHC.IO
  55 import GHC.IOArray
  56 import GHC.IORef
  57 #else
  58 import Data.Char        ( ord )
  59 import Data.IORef       ( IORef, newIORef, readIORef, writeIORef )
  60 import System.IO.Unsafe ( unsafePerformIO )
  61 import Data.Int         ( Int64 )
  62 #  if defined(__HUGS__)
  63 import Hugs.IOArray     ( IOArray, newIOArray,
  64                           unsafeReadIOArray, unsafeWriteIOArray )
  65 #  elif defined(__NHC__)
  66 import NHC.IOExtras     ( IOArray, newIOArray, readIOArray, writeIOArray )
  67 #  endif
  68 #endif
  69 import Control.Monad    ( mapM, mapM_, sequence_ )
  70
  71
  72 -----------------------------------------------------------------------
  73
  74 iNSTRUMENTED :: Bool
  75 iNSTRUMENTED = False
  76
  77 -----------------------------------------------------------------------
  78
  79 readHTArray  :: HTArray a -> Int32 -> IO a
  80 writeMutArray :: MutArray a -> Int32 -> a -> IO ()
  81 newMutArray   :: (Int32, Int32) -> a -> IO (MutArray a)
  82 newMutArray = newIOArray
  83 type MutArray a = IOArray Int32 a
  84 type HTArray a = MutArray a
  85 #if defined(DEBUG) || defined(__NHC__)
  86 readHTArray  = readIOArray
  87 writeMutArray = writeIOArray
  88 #else
  89 readHTArray arr i = unsafeReadIOArray arr (fromIntegral i)
  90 writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
  91 #endif
  92
  93 data HashTable key val = HashTable {
  94                                      cmp     :: !(key -> key -> Bool),
  95                                      hash_fn :: !(key -> Int32),
  96                                      tab     :: !(IORef (HT key val))
  97                                    }
  98 -- TODO: the IORef should really be an MVar.
  99
 100 data HT key val
 101   = HT {
 102         kcount  :: !Int32,              -- Total number of keys.
 103         bmask   :: !Int32,
 104         buckets :: !(HTArray [(key,val)])
 105        }
 106
 107 -- ------------------------------------------------------------
 108 -- Instrumentation for performance tuning
 109
 110 -- This ought to be roundly ignored after optimization when
 111 -- iNSTRUMENTED=False.
 112
 113 -- STRICT version of modifyIORef!
 114 modifyIORef :: IORef a -> (a -> a) -> IO ()
 115 modifyIORef r f = do
 116   v <- readIORef r
 117   let z = f v in z `seq` writeIORef r z
 118
 119 data HashData = HD {
 120   tables :: !Integer,
 121   insertions :: !Integer,
 122   lookups :: !Integer,
 123   totBuckets :: !Integer,
 124   maxEntries :: !Int32,
 125   maxChain :: !Int,
 126   maxBuckets :: !Int32
 127 } deriving (Eq, Show)
 128
 129 {-# NOINLINE hashData #-}
 130 hashData :: IORef HashData
 131 hashData =  unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
 132                                             totBuckets=0, maxEntries=0,
 133                                             maxChain=0, maxBuckets=tABLE_MIN } ))
 134
 135 instrument :: (HashData -> HashData) -> IO ()
 136 instrument i | iNSTRUMENTED = modifyIORef hashData i
 137              | otherwise    = return ()
 138
 139 recordNew :: IO ()
 140 recordNew = instrument rec
 141   where rec hd@HD{ tables=t, totBuckets=b } =
 142                hd{ tables=t+1, totBuckets=b+fromIntegral tABLE_MIN }
 143
 144 recordIns :: Int32 -> Int32 -> [a] -> IO ()
 145 recordIns i sz bkt = instrument rec
 146   where rec hd@HD{ insertions=ins, maxEntries=mx, maxChain=mc } =
 147                hd{ insertions=ins+fromIntegral i, maxEntries=mx `max` sz,
 148                    maxChain=mc `max` length bkt }
 149
 150 recordResize :: Int32 -> Int32 -> IO ()
 151 recordResize older newer = instrument rec
 152   where rec hd@HD{ totBuckets=b, maxBuckets=mx } =
 153                hd{ totBuckets=b+fromIntegral (newer-older),
 154                    maxBuckets=mx `max` newer }
 155
 156 recordLookup :: IO ()
 157 recordLookup = instrument lkup
 158   where lkup hd@HD{ lookups=l } = hd{ lookups=l+1 }
 159
 160 -- stats :: IO String
 161 -- stats =  fmap show $ readIORef hashData
 162
 163 -- ----------------------------------------------------------------------------
 164 -- Sample hash functions
 165
 166 -- $hash_functions
 167 --
 168 -- This implementation of hash tables uses the low-order /n/ bits of the hash
 169 -- value for a key, where /n/ varies as the hash table grows.  A good hash
 170 -- function therefore will give an even distribution regardless of /n/.
 171 --
 172 -- If your keyspace is integrals such that the low-order bits between
 173 -- keys are highly variable, then you could get away with using 'fromIntegral'
 174 -- as the hash function.
 175 --
 176 -- We provide some sample hash functions for 'Int' and 'String' below.
 177
 178 golden :: Int32
 179 golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
 180 -- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
 181 -- but that has bad mulHi properties (even adding 2^32 to get its inverse)
 182 -- Whereas the above works well and contains no hash duplications for
 183 -- [-32767..65536]
 184
 185 hashInt32 :: Int32 -> Int32
 186 hashInt32 x = mulHi x golden + x
 187
 188 -- | A sample (and useful) hash function for Int and Int32,
 189 -- implemented by extracting the uppermost 32 bits of the 64-bit
 190 -- result of multiplying by a 33-bit constant.  The constant is from
 191 -- Knuth, derived from the golden ratio:
 192 --
 193 -- > golden = round ((sqrt 5 - 1) * 2^32)
 194 --
 195 -- We get good key uniqueness on small inputs
 196 -- (a problem with previous versions):
 197 --  (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
 198 --
 199 hashInt :: Int -> Int32
 200 hashInt x = hashInt32 (fromIntegral x)
 201
 202 -- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
 203 mulHi :: Int32 -> Int32 -> Int32
 204 mulHi a b = fromIntegral (r `shiftR` 32)
 205    where r :: Int64
 206          r = fromIntegral a * fromIntegral b
 207
 208 -- | A sample hash function for Strings.  We keep multiplying by the
 209 -- golden ratio and adding.  The implementation is:
 210 --
 211 -- > hashString = foldl' f golden
 212 -- >   where f m c = fromIntegral (ord c) * magic + hashInt32 m
 213 -- >         magic = 0xdeadbeef
 214 --
 215 -- Where hashInt32 works just as hashInt shown above.
 216 --
 217 -- Knuth argues that repeated multiplication by the golden ratio
 218 -- will minimize gaps in the hash space, and thus it's a good choice
 219 -- for combining together multiple keys to form one.
 220 --
 221 -- Here we know that individual characters c are often small, and this
 222 -- produces frequent collisions if we use ord c alone.  A
 223 -- particular problem are the shorter low ASCII and ISO-8859-1
 224 -- character strings.  We pre-multiply by a magic twiddle factor to
 225 -- obtain a good distribution.  In fact, given the following test:
 226 --
 227 -- > testp :: Int32 -> Int
 228 -- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
 229 -- >   where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
 230 -- >         hs = foldl' f golden
 231 -- >         f m c = fromIntegral (ord c) * k + hashInt32 m
 232 -- >         n = 100000
 233 --
 234 -- We discover that testp magic = 0.
 235
 236 hashString :: String -> Int32
 237 hashString = foldl' f golden
 238    where f m c = fromIntegral (ord c) * magic + hashInt32 m
 239          magic = 0xdeadbeef
 240
 241 -- | A prime larger than the maximum hash table size
 242 prime :: Int32
 243 prime = 33554467
 244
 245 -- -----------------------------------------------------------------------------
 246 -- Parameters
 247
 248 tABLE_MAX :: Int32
 249 tABLE_MAX  = 32 * 1024 * 1024   -- Maximum size of hash table
 250 tABLE_MIN :: Int32
 251 tABLE_MIN  = 8
 252
 253 hLOAD :: Int32
 254 hLOAD = 7                       -- Maximum average load of a single hash bucket
 255
 256 hYSTERESIS :: Int32
 257 hYSTERESIS = 64                 -- entries to ignore in load computation
 258
 259 {- Hysteresis favors long association-list-like behavior for small tables. -}
 260
 261 -- -----------------------------------------------------------------------------
 262 -- Creating a new hash table
 263
 264 -- | Creates a new hash table.  The following property should hold for the @eq@
 265 -- and @hash@ functions passed to 'new':
 266 --
 267 -- >   eq A B  =>  hash A == hash B
 268 --
 269 new
 270   :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
 271   -> (key -> Int32)          -- ^ @hash@: A hash function on keys
 272   -> IO (HashTable key val)  -- ^ Returns: an empty hash table
 273
 274 new cmpr hash = do
 275   recordNew
 276   -- make a new hash table with a single, empty, segment
 277   let mask = tABLE_MIN-1
 278   bkts <- newMutArray (0,mask) []
 279
 280   let
 281     kcnt = 0
 282     ht = HT {  buckets=bkts, kcount=kcnt, bmask=mask }
 283
 284   table <- newIORef ht
 285   return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
 286
 287 {-
 288    bitTwiddleSameAs takes as arguments positive Int32s less than maxBound/2 and
 289    returns the smallest power of 2 that is greater than or equal to the
 290    argument.
 291    http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
 292 -}
 293 bitTwiddleSameAs :: Int32 -> Int32
 294 bitTwiddleSameAs v0 =
 295     let v1 = v0-1
 296         v2 = v1 .|. (v1`shiftR`1)
 297         v3 = v2 .|. (v2`shiftR`2)
 298         v4 = v3 .|. (v3`shiftR`4)
 299         v5 = v4 .|. (v4`shiftR`8)
 300         v6 = v5 .|. (v5`shiftR`16)
 301     in v6+1
 302
 303 {-
 304   powerOver takes as arguments Int32s and returns the smallest power of 2
 305   that is greater than or equal to the argument if that power of 2 is
 306   within [tABLE_MIN,tABLE_MAX]
 307 -}
 308 powerOver :: Int32 -> Int32
 309 powerOver n =
 310     if n <= tABLE_MIN
 311     then tABLE_MIN
 312     else if n >= tABLE_MAX
 313          then tABLE_MAX
 314          else bitTwiddleSameAs n
 315
 316 -- | Creates a new hash table with the given minimum size.
 317 newHint
 318   :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
 319   -> (key -> Int32)          -- ^ @hash@: A hash function on keys
 320   -> Int                     -- ^ @minSize@: initial table size
 321   -> IO (HashTable key val)  -- ^ Returns: an empty hash table
 322
 323 newHint cmpr hash minSize = do
 324   recordNew
 325   -- make a new hash table with a single, empty, segment
 326   let mask = powerOver $ fromIntegral minSize
 327   bkts <- newMutArray (0,mask) []
 328
 329   let
 330     kcnt = 0
 331     ht = HT {  buckets=bkts, kcount=kcnt, bmask=mask }
 332
 333   table <- newIORef ht
 334   return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })
 335
 336 -- -----------------------------------------------------------------------------
 337 -- Inserting a key\/value pair into the hash table
 338
 339 -- | Inserts a key\/value mapping into the hash table.
 340 --
 341 -- Note that 'insert' doesn't remove the old entry from the table -
 342 -- the behaviour is like an association list, where 'lookup' returns
 343 -- the most-recently-inserted mapping for a key in the table.  The
 344 -- reason for this is to keep 'insert' as efficient as possible.  If
 345 -- you need to update a mapping, then we provide 'update'.
 346 --
 347 insert :: HashTable key val -> key -> val -> IO ()
 348
 349 insert ht key val =
 350   updatingBucket CanInsert (\bucket -> ((key,val):bucket, 1, ())) ht key
 351
 352
 353 -- ------------------------------------------------------------
 354 -- The core of the implementation is lurking down here, in findBucket,
 355 -- updatingBucket, and expandHashTable.
 356
 357 tooBig :: Int32 -> Int32 -> Bool
 358 tooBig k b = k-hYSTERESIS > hLOAD * b
 359
 360 -- index of bucket within table.
 361 bucketIndex :: Int32 -> Int32 -> Int32
 362 bucketIndex mask h = h .&. mask
 363
 364 -- find the bucket in which the key belongs.
 365 -- returns (key equality, bucket index, bucket)
 366 --
 367 -- This rather grab-bag approach gives enough power to do pretty much
 368 -- any bucket-finding thing you might want to do.  We rely on inlining
 369 -- to throw away the stuff we don't want.  I'm proud to say that this
 370 -- plus updatingBucket below reduce most of the other definitions to a
 371 -- few lines of code, while actually speeding up the hashtable
 372 -- implementation when compared with a version which does everything
 373 -- from scratch.
 374 {-# INLINE findBucket #-}
 375 findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
 376 findBucket HashTable{ tab=ref, hash_fn=hash} key = do
 377   table@HT{ buckets=bkts, bmask=b } <- readIORef ref
 378   let indx = bucketIndex b (hash key)
 379   bucket <- readHTArray bkts indx
 380   return (table, indx, bucket)
 381
 382 data Inserts = CanInsert
 383              | Can'tInsert
 384              deriving (Eq)
 385
 386 -- updatingBucket is the real workhorse of all single-element table
 387 -- updates.  It takes a hashtable and a key, along with a function
 388 -- describing what to do with the bucket in which that key belongs.  A
 389 -- flag indicates whether this function may perform table insertions.
 390 -- The function returns the new contents of the bucket, the number of
 391 -- bucket entries inserted (negative if entries were deleted), and a
 392 -- value which becomes the return value for the function as a whole.
 393 -- The table sizing is enforced here, calling out to expandSubTable as
 394 -- necessary.
 395
 396 -- This function is intended to be inlined and specialized for every
 397 -- calling context (eg every provided bucketFn).
 398 {-# INLINE updatingBucket #-}
 399
 400 updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
 401                   HashTable key val -> key ->
 402                   IO a
 403 updatingBucket canEnlarge bucketFn
 404                ht@HashTable{ tab=ref, hash_fn=hash } key = do
 405   (table@HT{ kcount=k, buckets=bkts, bmask=b },
 406    indx, bckt) <- findBucket ht key
 407   (bckt', inserts, result) <- return $ bucketFn bckt
 408   let k' = k + inserts
 409       table1 = table { kcount=k' }
 410   writeMutArray bkts indx bckt'
 411   table2 <- if canEnlarge == CanInsert && inserts > 0 then do
 412                recordIns inserts k' bckt'
 413                if tooBig k' b
 414                   then expandHashTable hash table1
 415                   else return table1
 416             else return table1
 417   writeIORef ref table2
 418   return result
 419
 420 expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
 421 expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
 422    let
 423       oldsize = mask + 1
 424       newmask = mask + mask + 1
 425    recordResize oldsize (newmask+1)
 426    --
 427    if newmask > tABLE_MAX-1
 428       then return table
 429       else do
 430    --
 431     newbkts <- newMutArray (0,newmask) []
 432
 433     let
 434      splitBucket oldindex = do
 435        bucket <- readHTArray bkts oldindex
 436        let (oldb,newb) =
 437               partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
 438        writeMutArray newbkts oldindex oldb
 439        writeMutArray newbkts (oldindex + oldsize) newb
 440     mapM_ splitBucket [0..mask]
 441
 442     return ( table{ buckets=newbkts, bmask=newmask } )
 443
 444 -- -----------------------------------------------------------------------------
 445 -- Deleting a mapping from the hash table
 446
 447 -- Remove a key from a bucket
 448 deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key, val)], Int32, ())
 449 deleteBucket _   [] = ([],0,())
 450 deleteBucket del (pair@(k,_):bucket) =
 451   case deleteBucket del bucket of
 452     (bucket', dels, _) | del k     -> dels' `seq` (bucket', dels', ())
 453                        | otherwise -> (pair:bucket', dels, ())
 454       where dels' = dels - 1
 455
 456 -- | Remove an entry from the hash table.
 457 delete :: HashTable key val -> key -> IO ()
 458
 459 delete ht@HashTable{ cmp=eq } key =
 460   updatingBucket Can'tInsert (deleteBucket (eq key)) ht key
 461
 462 -- -----------------------------------------------------------------------------
 463 -- Updating a mapping in the hash table
 464
 465 -- | Updates an entry in the hash table, returning 'True' if there was
 466 -- already an entry for this key, or 'False' otherwise.  After 'update'
 467 -- there will always be exactly one entry for the given key in the table.
 468 --
 469 -- 'insert' is more efficient than 'update' if you don't care about
 470 -- multiple entries, or you know for sure that multiple entries can't
 471 -- occur.  However, 'update' is more efficient than 'delete' followed
 472 -- by 'insert'.
 473 update :: HashTable key val -> key -> val -> IO Bool
 474
 475 update ht@HashTable{ cmp=eq } key val =
 476   updatingBucket CanInsert
 477     (\bucket -> let (bucket', dels, _) = deleteBucket (eq key) bucket
 478                 in  ((key,val):bucket', 1+dels, dels/=0))
 479     ht key
 480
 481 -- -----------------------------------------------------------------------------
 482 -- Looking up an entry in the hash table
 483
 484 -- | Looks up the value of a key in the hash table.
 485 lookup :: HashTable key val -> key -> IO (Maybe val)
 486
 487 lookup ht@HashTable{ cmp=eq } key = do
 488   recordLookup
 489   (_, _, bucket) <- findBucket ht key
 490   let firstHit (k,v) r | eq key k  = Just v
 491                        | otherwise = r
 492   return (foldr firstHit Nothing bucket)
 493
 494 -- -----------------------------------------------------------------------------
 495 -- Converting to/from lists
 496
 497 -- | Convert a list of key\/value pairs into a hash table.  Equality on keys
 498 -- is taken from the Eq instance for the key type.
 499 --
 500 fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
 501 fromList hash list = do
 502   table <- new (==) hash
 503   sequence_ [ insert table k v | (k,v) <- list ]
 504   return table
 505
 506 -- | Converts a hash table to a list of key\/value pairs.
 507 --
 508 toList :: HashTable key val -> IO [(key,val)]
 509 toList = mapReduce id concat
 510
 511 {-# INLINE mapReduce #-}
 512 mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
 513 mapReduce m r HashTable{ tab=ref } = do
 514   HT{ buckets=bckts, bmask=b } <- readIORef ref
 515   fmap r (mapM (fmap m . readHTArray bckts) [0..b])
 516
 517 -- -----------------------------------------------------------------------------
 518 -- Diagnostics
 519
 520 -- | This function is useful for determining whether your hash
 521 -- function is working well for your data set.  It returns the longest
 522 -- chain of key\/value pairs in the hash table for which all the keys
 523 -- hash to the same bucket.  If this chain is particularly long (say,
 524 -- longer than 14 elements or so), then it might be a good idea to try
 525 -- a different hash function.
 526 --
 527 longestChain :: HashTable key val -> IO [(key,val)]
 528 longestChain = mapReduce id (maximumBy lengthCmp)
 529   where lengthCmp (_:x)(_:y) = lengthCmp x y
 530         lengthCmp []   []    = EQ
 531         lengthCmp []   _     = LT
 532         lengthCmp _    []    = GT