{-# OPTIONS_GHC -fno-implicit-prelude #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  Data.HashTable
-- Copyright   :  (c) The University of Glasgow 2003
-- License     :  BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  provisional
-- Portability :  portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457.  The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
-----------------------------------------------------------------------------
module Data.HashTable (
        -- * Basic hash table operations
        HashTable, new, insert, delete, lookup, update,
        -- * Converting to and from lists
        fromList, toList,
        -- * Hash functions
        -- $hash_functions
        hashInt, hashString,
        prime,
        -- * Diagnostics
        longestChain
 ) where

-- This module is imported by Data.Dynamic, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules
#ifdef __GLASGOW_HASKELL__
import GHC.Base
#else
import Prelude          hiding ( lookup )
#endif
import Data.Tuple       ( fst )
import Data.Bits
import Data.Maybe
import Data.List        ( maximumBy, filter, length, concat, foldl )
import Data.Int         ( Int32 )

#if defined(__GLASGOW_HASKELL__)
import GHC.Num
import GHC.Real         ( Integral(..), fromIntegral )

import GHC.IOBase       ( IO, IOArray, newIOArray, readIOArray, writeIOArray,
                          unsafeReadIOArray, unsafeWriteIOArray,
                          IORef, newIORef, readIORef, writeIORef )
import GHC.Err          ( undefined )
#else
import Data.Char        ( ord )
import Data.IORef       ( IORef, newIORef, readIORef, writeIORef )
#  if defined(__HUGS__)
import Hugs.IOArray     ( IOArray, newIOArray, readIOArray, writeIOArray,
                          unsafeReadIOArray, unsafeWriteIOArray )
#  elif defined(__NHC__)
import NHC.IOExtras     ( IOArray, newIOArray, readIOArray, writeIOArray )
#  endif
#endif
import Control.Monad    ( when, mapM, sequence_ )
-----------------------------------------------------------------------
myReadArray  :: IOArray Int32 a -> Int32 -> IO a
myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
#if defined(DEBUG) || defined(__NHC__)
myReadArray  = readIOArray
myWriteArray = writeIOArray
#else
myReadArray  arr i   = unsafeReadIOArray  arr (fromIntegral i)
myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
#endif
-- | A hash table mapping keys of type @key@ to values of type @val@.
--
-- The implementation will grow the hash table as necessary, trying to
-- maintain a reasonable average load per bucket in the table.
--
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar.

data HT key val
  = HT {
        split      :: !Int32, -- Next bucket to split when expanding
        max_bucket :: !Int32, -- Max bucket of smaller table
        mask1      :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
        mask2      :: !Int32, -- Mask for doing the mod of h_2 (larger table)
        kcount     :: !Int32, -- Number of keys
        bcount     :: !Int32, -- Number of buckets
        dir        :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
        hash_fn    :: key -> Int32,
        cmp        :: key -> key -> Bool
   }
{-
ALTERNATIVE IMPLEMENTATION:

This works out slightly slower, because there's a tradeoff between
allocating a complete new HT structure each time a modification is
made (in the version above), and allocating new Int32s each time one
of them is modified, as below.  Using FastMutInt instead of IORef
Int32 helps, but yields an implementation which has about the same
performance as the version above (and is more complex).

data HashTable key val
  = HashTable {
        split      :: !(IORef Int32), -- Next bucket to split when expanding
        max_bucket :: !(IORef Int32), -- Max bucket of smaller table
        mask1      :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
        mask2      :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
        kcount     :: !(IORef Int32), -- Number of keys
        bcount     :: !(IORef Int32), -- Number of buckets
        dir        :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
        hash_fn    :: key -> Int32,
        cmp        :: key -> key -> Bool
   }
-}
-- -----------------------------------------------------------------------------
-- Sample hash functions

-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows.  A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
-- keys are highly variable, then you could get away with using 'id'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.
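
-- As an illustration of the guidance above (not part of the original API),
-- a hash function for a composite key can be built by combining the sample
-- functions below.  The helper name @hashPair@ is hypothetical:
--
-- > import Data.Bits (xor)
-- > import Data.Int  (Int32)
-- >
-- > hashPair :: (Int, String) -> Int32
-- > hashPair (n, s) = hashInt n `xor` hashString s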
-- | A sample hash function for 'Int', implemented as simply @(x \`rem\` P)@
-- where P is a suitable prime (currently 1500007).  Should give
-- reasonable results for most distributions of 'Int' values, except
-- when the keys are all multiples of the prime!
--
hashInt :: Int -> Int32
hashInt = (`rem` prime) . fromIntegral
-- | A sample hash function for 'String's.  The implementation is:
--
-- >    hashString = fromIntegral . foldl f 0
-- >      where f m c = ord c + (m * 128) `rem` fromIntegral prime
--
-- which seems to give reasonable results.
--
hashString :: String -> Int32
hashString = fromIntegral . foldl f 0
  where f m c = ord c + (m * 128) `rem` fromIntegral prime
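
-- A quick hand-worked check of the definition above (values computed by hand,
-- not part of the original module).  Note that @\`rem\`@ binds tighter than
-- @+@, so @f@ adds @ord c@ to @(m * 128) \`rem\` prime@.  For the string @"ab"@:
--
-- > hashString "ab"
-- >   == ord 'b' + (ord 'a' * 128) `rem` 1500007
-- >   == 98 + 12416
-- >   == 12514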
-- | A prime larger than the maximum hash table size
prime :: Int32
prime = 1500007
-- -----------------------------------------------------------------------------
-- Parameters

sEGMENT_SIZE  = 1024  :: Int32  -- Size of a single hash table segment
sEGMENT_SHIFT = 10    :: Int    -- derived
sEGMENT_MASK  = 0x3ff :: Int32  -- derived

dIR_SIZE = 1024 :: Int32        -- Size of the segment directory
        -- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE

hLOAD = 4 :: Int32              -- Maximum average load of a single hash bucket
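
-- Putting the parameters above together (a back-of-the-envelope note, not in
-- the original source): the directory holds dIR_SIZE segments of sEGMENT_SIZE
-- buckets each, so the table stops growing at
--
-- > sEGMENT_SIZE * dIR_SIZE  ==  1024 * 1024  ==  1048576    -- buckets
--
-- and the average load stays near hLOAD (4 keys per bucket) only up to roughly
-- 4 million keys; beyond that point the buckets simply get longer.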
-- -----------------------------------------------------------------------------
-- Creating a new hash table

-- | Creates a new hash table.  The following property should hold for the @eq@
-- and @hash@ functions passed to 'new':
--
-- >   eq A B  =>  hash A == hash B
--
new
  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table
new cmp hash_fn = do
  -- make a new hash table with a single, empty, segment
  dir     <- newIOArray (0,dIR_SIZE-1) undefined
  segment <- newIOArray (0,sEGMENT_SIZE-1) []
  myWriteArray dir 0 segment
  let
    split  = 0
    max    = sEGMENT_SIZE
    mask1  = (sEGMENT_SIZE - 1)
    mask2  = (2 * sEGMENT_SIZE - 1)
    kcount = 0
    bcount = sEGMENT_SIZE

    ht = HT { dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
              kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
            }
  table <- newIORef ht
  return (HashTable table)
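
-- A minimal construction sketch (illustrative only; the names 'eqCI' and
-- 'hashCI' are hypothetical).  A case-insensitive string table must make the
-- hash function agree with the equality test, per the property above:
--
-- > import Data.Char (toLower)
-- > import Data.Int  (Int32)
-- >
-- > eqCI :: String -> String -> Bool
-- > eqCI a b = map toLower a == map toLower b
-- >
-- > hashCI :: String -> Int32
-- > hashCI = hashString . map toLower
-- >
-- > newCITable :: IO (HashTable String Int)
-- > newCITable = new eqCI hashCI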
-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table

-- | Inserts a key\/value mapping into the hash table.
--
-- Note that 'insert' doesn't remove the old entry from the table -
-- the behaviour is like an association list, where 'lookup' returns
-- the most-recently-inserted mapping for a key in the table.  The
-- reason for this is to keep 'insert' as efficient as possible.  If
-- you need to update a mapping, then we provide 'update'.
--
insert :: HashTable key val -> key -> val -> IO ()

insert (HashTable ref) key val = do
  table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
  let table1 = table{ kcount = k+1 }
  table2 <-
        if (k > hLOAD * b)
           then expandHashTable table1
           else return table1
  writeIORef ref table2
  (segment_index,segment_offset) <- tableLocation table2 key
  segment <- myReadArray dir segment_index
  bucket  <- myReadArray segment segment_offset
  myWriteArray segment segment_offset ((key,val):bucket)
  return ()
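
-- The association-list behaviour described above, in a small sketch (the
-- name 'shadowDemo' is hypothetical):
--
-- > shadowDemo :: IO (Maybe Int)
-- > shadowDemo = do
-- >   t <- new (==) hashString
-- >   insert t "k" 1
-- >   insert t "k" 2         -- the first mapping is shadowed, not removed
-- >   lookup t "k"           -- returns Just 2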
bucketIndex :: HT key val -> key -> IO Int32
bucketIndex HT{ hash_fn=hash_fn,
                split=split,
                mask1=mask1,
                mask2=mask2 } key = do
  let
    h = fromIntegral (hash_fn key)
    small_bucket = h .&. mask1
    large_bucket = h .&. mask2
  --
  if small_bucket < split
        then return large_bucket
        else return small_bucket
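
-- A worked example of the choice above (illustrative numbers): suppose
-- mask1 = 1023, mask2 = 2047 and split = 3.  A key with hash 1026 gives
-- small_bucket = 2 and large_bucket = 1026; since 2 < split, bucket 2 has
-- already been split this round, so the entry goes to bucket 1026.  A key
-- with hash 1029 gives small_bucket = 5, which is >= split, so bucket 5 of
-- the smaller table is used.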
tableLocation :: HT key val -> key -> IO (Int32,Int32)
tableLocation table key = do
  bucket_index <- bucketIndex table key
  let
    segment_index  = bucket_index `shiftR` sEGMENT_SHIFT
    segment_offset = bucket_index .&. sEGMENT_MASK
  --
  return (segment_index,segment_offset)
expandHashTable :: HT key val -> IO (HT key val)
expandHashTable
      table@HT{ dir=dir,
                split=split,
                max_bucket=max,
                bcount=bcount,
                mask2=mask2 } = do
  let
    oldsegment = split `shiftR` sEGMENT_SHIFT
    oldindex   = split .&. sEGMENT_MASK

    newbucket  = max + split
    newsegment = newbucket `shiftR` sEGMENT_SHIFT
    newindex   = newbucket .&. sEGMENT_MASK
  --
  if newsegment >= dIR_SIZE     -- make sure we don't overflow the table.
        then return table
        else do
   --
   when (newindex == 0) $
        do segment <- newIOArray (0,sEGMENT_SIZE-1) []
           writeIOArray dir newsegment segment
           -- doesn't happen very often, so we might as well use a safe
           -- array index here.
   --
   let table' =
        if (split+1) < max
            then table{ split = split+1,
                        bcount = bcount+1 }
                -- we've expanded all the buckets in this table, so start from
                -- the beginning again.
            else table{ split = 0,
                        bcount = bcount+1,
                        max_bucket = max * 2,
                        mask1 = mask2,
                        mask2 = mask2 `shiftL` 1 .|. 1 }
   let
    split_bucket old new [] = do
        segment <- myReadArray dir oldsegment
        myWriteArray segment oldindex old
        segment <- myReadArray dir newsegment
        myWriteArray segment newindex new
    split_bucket old new ((k,v):xs) = do
        h <- bucketIndex table' k
        if h == newbucket
                then split_bucket old ((k,v):new) xs
                else split_bucket ((k,v):old) new xs
   --
   segment <- myReadArray dir oldsegment
   bucket <- myReadArray segment oldindex
   split_bucket [] [] bucket
   return table'
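
-- A worked expansion step, with illustrative numbers tied to the constants
-- above: starting from a fresh table (max_bucket = 1024, split = 0,
-- mask2 = 2047), one call rehashes the contents of bucket 0 using mask2,
-- leaving each entry in bucket 0 or moving it to the new bucket 1024, and
-- advances split to 1.  After 1024 such calls every original bucket has been
-- split, so the table doubles: split resets to 0, max_bucket becomes 2048,
-- mask1 takes the old mask2, and mask2 becomes 4095.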
-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table

-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()

delete (HashTable ref) key = do
  table@HT{ dir=dir, cmp=cmp } <- readIORef ref
  (segment_index,segment_offset) <- tableLocation table key
  segment <- myReadArray dir segment_index
  bucket  <- myReadArray segment segment_offset
  myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
  return ()
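
-- A small illustrative sequence (the name 'deleteDemo' is hypothetical):
-- after 'delete', 'lookup' no longer finds the key.
--
-- > deleteDemo :: IO (Maybe Int)
-- > deleteDemo = do
-- >   t <- new (==) hashString
-- >   insert t "x" 1
-- >   delete t "x"
-- >   lookup t "x"           -- returns Nothing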
-- -----------------------------------------------------------------------------
-- Updating a mapping in the hash table

-- | Updates an entry in the hash table, returning 'True' if there was
-- already an entry for this key, or 'False' otherwise.  After 'update'
-- there will always be exactly one entry for the given key in the table.
--
-- 'insert' is more efficient than 'update' if you don't care about
-- multiple entries, or you know for sure that multiple entries can't
-- occur.  However, 'update' is more efficient than 'delete' followed
-- by 'insert'.
update :: HashTable key val -> key -> val -> IO Bool

update (HashTable ref) key val = do
  table@HT{ kcount=k, bcount=b, dir=dir, cmp=cmp } <- readIORef ref
  let table1 = table{ kcount = k+1 }
  -- optimistically expand the table
  table2 <-
        if (k > hLOAD * b)
           then expandHashTable table1
           else return table1
  writeIORef ref table2
  (segment_index,segment_offset) <- tableLocation table2 key
  segment <- myReadArray dir segment_index
  bucket  <- myReadArray segment segment_offset
  let
    (deleted,bucket') = foldr filt (0::Int32,[]) bucket
    filt pair@(k,v) (deleted,bucket)
        | key `cmp` k = (deleted+1, bucket)
        | otherwise   = (deleted, pair:bucket)
  --
  myWriteArray segment segment_offset ((key,val):bucket')
  -- update the table load, taking into account the number of
  -- items we just deleted.
  writeIORef ref table2{ kcount = kcount table2 - deleted }
  return (deleted /= 0)
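
-- A short sketch of the return value (the name 'updateDemo' is hypothetical):
--
-- > updateDemo :: IO (Bool, Bool)
-- > updateDemo = do
-- >   t  <- new (==) hashString
-- >   b1 <- update t "k" (1 :: Int)   -- False: no previous entry
-- >   b2 <- update t "k" 2            -- True: replaces the existing entry
-- >   return (b1, b2)                 -- (False, True)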
-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table

-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)

lookup (HashTable ref) key = do
  table@HT{ dir=dir, cmp=cmp } <- readIORef ref
  (segment_index,segment_offset) <- tableLocation table key
  segment <- myReadArray dir segment_index
  bucket  <- myReadArray segment segment_offset
  case [ val | (key',val) <- bucket, cmp key key' ] of
        []    -> return Nothing
        (v:_) -> return (Just v)
-- -----------------------------------------------------------------------------
-- Converting to/from lists

-- | Convert a list of key\/value pairs into a hash table.  Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash_fn list = do
  table <- new (==) hash_fn
  sequence_ [ insert table k v | (k,v) <- list ]
  return table
-- | Converts a hash table to a list of key\/value pairs.
--
toList :: HashTable key val -> IO [(key,val)]
toList (HashTable ref) = do
  HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
  let
    max_segment = (max + split - 1) `quot` sEGMENT_SIZE
  --
  segments <- mapM (segmentContents dir) [0 .. max_segment]
  return (concat segments)
 where
   segmentContents dir seg_index = do
     segment <- myReadArray dir seg_index
     bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
     return (concat bs)
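
-- A round-trip sketch (the name 'roundTrip' is hypothetical).  Note that
-- 'toList' makes no ordering guarantee, and duplicate keys inserted via
-- 'fromList' all appear in the output:
--
-- > roundTrip :: IO [(String, Int)]
-- > roundTrip = do
-- >   t <- fromList hashString [("a",1), ("b",2)]
-- >   toList t                 -- [("a",1),("b",2)] in some order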
-- -----------------------------------------------------------------------------
-- Diagnostics

-- | This function is useful for determining whether your hash function
-- is working well for your data set.  It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket.  If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain (HashTable ref) = do
  HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
  let
    max_segment = (max + split - 1) `quot` sEGMENT_SIZE
  --
  --trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
  segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
  return (maximumBy lengthCmp segments)
 where
   segmentMaxChainLength dir seg_index = do
     segment <- myReadArray dir seg_index
     bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
     return (maximumBy lengthCmp bs)

   lengthCmp x y = length x `compare` length y
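
-- A diagnostic sketch along the lines of the comment above (the name
-- 'checkHashQuality' is hypothetical): populate a table, then warn if the
-- longest chain suggests the hash function is clustering keys.
--
-- > import Control.Monad (when)
-- >
-- > checkHashQuality :: HashTable String Int -> IO ()
-- > checkHashQuality t = do
-- >   chain <- longestChain t
-- >   when (length chain > 10) $
-- >     putStrLn ("suspiciously long chain of " ++ show (length chain) ++ " entries")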