Data/HashTable.hs

   1 -----------------------------------------------------------------------------
   2 -- |
   3 -- Module      :  Data.HashTable
   4 -- Copyright   :  (c) The University of Glasgow 2003
   5 -- License     :  BSD-style (see the file libraries/base/LICENSE)
   6 --
   7 -- Maintainer  :  libraries@haskell.org
   8 -- Stability   :  provisional
   9 -- Portability :  portable
  10 --
  11 -- An implementation of extensible hash tables, as described in
  12 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
  13 -- pp. 446--457.  The implementation is also derived from the one
  14 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
  15 --
  16 -----------------------------------------------------------------------------
  17
  18 module Data.HashTable (
  19         -- * Basic hash table operations
  20         HashTable, new, insert, delete, lookup,
  21         -- * Converting to and from lists
  22         fromList, toList,
  23         -- * Hash functions
  24         -- $hash_functions
  25         hashInt, hashString,
  26         -- * Diagnostics
  27         longestChain
  28  ) where
  29
  30 import Data.Char        ( ord )
  31 import Data.Int         ( Int32 )
  32 import Data.Array.IO
  33 import Data.Array.Base
  34 import Data.List        ( maximumBy )
  35 import Data.IORef
  36 import Data.Bits
  37 import Control.Monad    ( when )
  38 import Prelude          hiding (lookup)
  39 --import Debug.Trace
  40
  41 myReadArray  :: IOArray Int32 a -> Int32 -> IO a
  42 myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
  43 #ifdef DEBUG
  44 myReadArray  = readArray
  45 myWriteArray = writeArray
  46 #else
  47 myReadArray arr i = unsafeRead arr (fromIntegral i)
  48 myWriteArray arr i x = unsafeWrite arr (fromIntegral i) x
  49 #endif
  50
  51 -- | A hash table mapping keys of type @key@ to values of type @val@.
  52 --
  53 -- The implementation will grow the hash table as necessary, trying to
  54 -- maintain a reasonable average load per bucket in the table.
  55 --
  56 newtype HashTable key val = HashTable (IORef (HT key val))
  57 -- TODO: the IORef should really be an MVar.
  58
  59 data HT key val
  60   = HT {
  61         split  :: !Int32, -- Next bucket to split when expanding
  62         max_bucket :: !Int32, -- Max bucket of smaller table
  63         mask1  :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
  64         mask2  :: !Int32, -- Mask for doing the mod of h_2 (larger table)
  65         kcount :: !Int32, -- Number of keys
  66         bcount :: !Int32, -- Number of buckets
  67         dir    :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
  68         hash_fn :: key -> Int32,
  69         cmp    :: key -> key -> Bool
  70    }
  71
  72 {-
  73 ALTERNATIVE IMPLEMENTATION:
  74
  75 This works out slightly slower, because there's a tradeoff between
  76 allocating a complete new HT structure each time a modification is
  77 made (in the version above), and allocating new Int32s each time one
  78 of them is modified, as below.  Using FastMutInt instead of IORef
  79 Int32 helps, but yields an implementation which has about the same
  80 performance as the version above (and is more complex).
  81
  82 data HashTable key val
  83   = HashTable {
  84         split  :: !(IORef Int32), -- Next bucket to split when expanding
  85         max_bucket :: !(IORef Int32), -- Max bucket of smaller table
  86         mask1  :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
  87         mask2  :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
  88         kcount :: !(IORef Int32), -- Number of keys
  89         bcount :: !(IORef Int32), -- Number of buckets
  90         dir    :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
  91         hash_fn :: key -> Int32,
  92         cmp    :: key -> key -> Bool
  93    }
  94 -}
  95
  96
  97 -- -----------------------------------------------------------------------------
  98 -- Sample hash functions
  99
 100 -- $hash_functions
 101 --
 102 -- This implementation of hash tables uses the low-order /n/ bits of the hash
 103 -- value for a key, where /n/ varies as the hash table grows.  A good hash
 104 -- function therefore will give a good distribution regardless of /n/.
 105 --
 106 -- If your keyspace is integrals such that the low-order bits between
 107 -- keys are highly variable, then you could get away with using 'id'
 108 -- as the hash function.
 109 --
 110 -- We provide some sample hash functions for 'Int' and 'String' below.
 111
 112 -- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
 113 -- where P is a suitable prime (currently 1500007).  Should give
 114 -- reasonable results for most distributions of 'Int' values, except
 115 -- when the keys are all multiples of the prime!
 116 --
 117 hashInt :: Int -> Int32
 118 hashInt = (`rem` prime) . fromIntegral
 119
 120 -- | A sample hash fucntion for 'String's.  The implementation is:
 121 --
 122 -- >    hashString = fromIntegral . foldr f 0
 123 -- >      where f c m = ord c + (m * 128) `rem` 1500007
 124 --
 125 -- which seems to give reasonable results.
 126 --
 127 hashString :: String -> Int32
 128 hashString = fromIntegral . foldr f 0
 129   where f c m = ord c + (m * 128) `rem` fromIntegral prime
 130
 131 -- a prime larger than the maximum hash table size
 132 prime = 1500007 :: Int32
 133
 134 -- -----------------------------------------------------------------------------
 135 -- Parameters
 136
 137 sEGMENT_SIZE  = 1024  :: Int32  -- Size of a single hash table segment
 138 sEGMENT_SHIFT = 10    :: Int  -- derived
 139 sEGMENT_MASK  = 0x3ff :: Int32  -- derived
 140
 141 dIR_SIZE = 1024  :: Int32  -- Size of the segment directory
 142         -- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
 143
 144 hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
 145
 146 -- -----------------------------------------------------------------------------
 147 -- Creating a new hash table
 148
 149 -- | Creates a new hash table
 150 new
 151   :: (key -> key -> Bool)    -- ^ An equality comparison on keys
 152   -> (key -> Int32)          -- ^ A hash function on keys
 153   -> IO (HashTable key val)  -- ^ Returns: an empty hash table
 154
 155 new cmp hash_fn = do
 156   -- make a new hash table with a single, empty, segment
 157   dir     <- newArray (0,dIR_SIZE) undefined
 158   segment <- newArray (0,sEGMENT_SIZE-1) []
 159   myWriteArray dir 0 segment
 160
 161   let
 162     split  = 0
 163     max    = sEGMENT_SIZE
 164     mask1  = (sEGMENT_SIZE - 1)
 165     mask2  = (2 * sEGMENT_SIZE - 1)
 166     kcount = 0
 167     bcount = sEGMENT_SIZE
 168
 169     ht = HT {  dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
 170                kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
 171           }
 172
 173   table <- newIORef ht
 174   return (HashTable table)
 175
 176 -- -----------------------------------------------------------------------------
 177 -- Inserting a key\/value pair into the hash table
 178
 179 -- | Inserts an key\/value mapping into the hash table.
 180 insert :: HashTable key val -> key -> val -> IO ()
 181
 182 insert (HashTable ref) key val = do
 183   table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
 184   let table1 = table{ kcount = k+1 }
 185   table2 <-
 186         if (k > hLOAD * b)
 187            then expandHashTable table1
 188            else return table1
 189   writeIORef ref table2
 190   (segment_index,segment_offset) <- tableLocation table key
 191   segment <- myReadArray dir segment_index
 192   bucket <- myReadArray segment segment_offset
 193   myWriteArray segment segment_offset ((key,val):bucket)
 194   return ()
 195
 196 bucketIndex :: HT key val -> key -> IO Int32
 197 bucketIndex HT{ hash_fn=hash_fn,
 198                 split=split,
 199                 mask1=mask1,
 200                 mask2=mask2 } key = do
 201   let
 202     h = fromIntegral (hash_fn key)
 203     small_bucket = h .&. mask1
 204     large_bucket = h .&. mask2
 205   --
 206   if small_bucket < split
 207         then return large_bucket
 208         else return small_bucket
 209
 210 tableLocation :: HT key val -> key -> IO (Int32,Int32)
 211 tableLocation table key = do
 212   bucket_index <- bucketIndex table key
 213   let
 214     segment_index  = bucket_index `shiftR` sEGMENT_SHIFT
 215     segment_offset = bucket_index .&. sEGMENT_MASK
 216   --
 217   return (segment_index,segment_offset)
 218
 219 expandHashTable :: HT key val -> IO (HT key val)
 220 expandHashTable
 221       table@HT{ dir=dir,
 222                 split=split,
 223                 max_bucket=max,
 224                 mask2=mask2 } = do
 225   let
 226       oldsegment = split `shiftR` sEGMENT_SHIFT
 227       oldindex   = split .&. sEGMENT_MASK
 228
 229       newbucket  = max + split
 230       newsegment = newbucket `shiftR` sEGMENT_SHIFT
 231       newindex   = newbucket .&. sEGMENT_MASK
 232   --
 233   when (newindex == 0) $
 234         do segment <- newArray (0,sEGMENT_SIZE-1) []
 235            myWriteArray dir newsegment segment
 236   --
 237   let table' =
 238         if (split+1) < max
 239             then table{ split = split+1 }
 240                 -- we've expanded all the buckets in this table, so start from
 241                 -- the beginning again.
 242             else table{ split = 0,
 243                         max_bucket = max * 2,
 244                         mask1 = mask2,
 245                         mask2 = mask2 `shiftL` 1 .|. 1 }
 246   let
 247     split_bucket old new [] = do
 248         segment <- myReadArray dir oldsegment
 249         myWriteArray segment oldindex old
 250         segment <- myReadArray dir newsegment
 251         myWriteArray segment newindex new
 252     split_bucket old new ((k,v):xs) = do
 253         h <- bucketIndex table' k
 254         if h == newbucket
 255                 then split_bucket old ((k,v):new) xs
 256                 else split_bucket ((k,v):old) new xs
 257   --
 258   segment <- myReadArray dir oldsegment
 259   bucket <- myReadArray segment oldindex
 260   split_bucket [] [] bucket
 261   return table'
 262
 263 -- -----------------------------------------------------------------------------
 264 -- Deleting a mapping from the hash table
 265
 266 -- | Remove an entry from the hash table.
 267 delete :: HashTable key val -> key -> IO ()
 268
 269 delete (HashTable ref) key = do
 270   table@HT{ dir=dir, cmp=cmp } <- readIORef ref
 271   (segment_index,segment_offset) <- tableLocation table key
 272   segment <- myReadArray dir segment_index
 273   bucket <- myReadArray segment segment_offset
 274   myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
 275   return ()
 276
 277 -- -----------------------------------------------------------------------------
 278 -- Looking up an entry in the hash table
 279
 280 -- | Looks up the value of a key in the hash table.
 281 lookup :: HashTable key val -> key -> IO (Maybe val)
 282
 283 lookup (HashTable ref) key = do
 284   table@HT{ dir=dir, cmp=cmp } <- readIORef ref
 285   (segment_index,segment_offset) <- tableLocation table key
 286   segment <- myReadArray dir segment_index
 287   bucket <- myReadArray segment segment_offset
 288   case [ val | (key',val) <- bucket, cmp key key' ] of
 289         [] -> return Nothing
 290         (v:_) -> return (Just v)
 291
 292 -- -----------------------------------------------------------------------------
 293 -- Converting to/from lists
 294
 295 -- | Convert a list of key\/value pairs into a hash table.  Equality on keys
 296 -- is taken from the Eq instance for the key type.
 297 --
 298 fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
 299 fromList hash_fn list = do
 300   table <- new (==) hash_fn
 301   sequence_ [ insert table k v | (k,v) <- list ]
 302   return table
 303
 304 -- | Converts a hash table to a list of key\/value pairs.
 305 --
 306 toList :: HashTable key val -> IO [(key,val)]
 307 toList (HashTable ref) = do
 308   HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
 309   --
 310   let
 311     max_segment = (max + split - 1) `quot` sEGMENT_SIZE
 312   --
 313   segments <- mapM (segmentContents dir) [0 .. max_segment]
 314   return (concat segments)
 315  where
 316    segmentContents dir seg_index = do
 317      segment <- myReadArray dir seg_index
 318      bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
 319      return (concat bs)
 320
 321 -- -----------------------------------------------------------------------------
 322 -- Diagnostics
 323
 324 -- | This function is useful for determining whether your hash function
 325 -- is working well for your data set.  It returns the longest chain
 326 -- of key\/value pairs in the hash table for which all the keys hash to
 327 -- the same bucket.  If this chain is particularly long (say, longer
 328 -- than 10 elements), then it might be a good idea to try a different
 329 -- hash function.
 330 --
 331 longestChain :: HashTable key val -> IO [(key,val)]
 332 longestChain (HashTable ref) = do
 333   HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
 334   --
 335   let
 336     max_segment = (max + split - 1) `quot` sEGMENT_SIZE
 337   --
 338   --trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
 339   segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
 340   return (maximumBy lengthCmp segments)
 341  where
 342    segmentMaxChainLength dir seg_index = do
 343      segment <- myReadArray dir seg_index
 344      bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
 345      return (maximumBy lengthCmp bs)
 346
 347    lengthCmp x y = length x `compare` length y