Data/HashTable.hs

   1 {-# OPTIONS -fno-implicit-prelude #-}
   2
   3 -----------------------------------------------------------------------------
   4 -- |
   5 -- Module      :  Data.HashTable
   6 -- Copyright   :  (c) The University of Glasgow 2003
   7 -- License     :  BSD-style (see the file libraries/base/LICENSE)
   8 --
   9 -- Maintainer  :  libraries@haskell.org
  10 -- Stability   :  provisional
  11 -- Portability :  portable
  12 --
  13 -- An implementation of extensible hash tables, as described in
  14 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
  15 -- pp. 446--457.  The implementation is also derived from the one
  16 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
  17 --
  18 -----------------------------------------------------------------------------
  19
  20 module Data.HashTable (
  21         -- * Basic hash table operations
  22         HashTable, new, insert, delete, lookup,
  23         -- * Converting to and from lists
  24         fromList, toList,
  25         -- * Hash functions
  26         -- $hash_functions
  27         hashInt, hashString,
  28         prime,
  29         -- * Diagnostics
  30         longestChain
  31  ) where
  32
  33 -- This module is imported by Data.Dynamic, which is pretty low down in the
  34 -- module hierarchy, so don't import "high-level" modules
  35
  36 import GHC.Base
  37 import Data.Tuple       ( fst )
  38 import Data.Bits
  39 import Data.Maybe
  40 import Data.List        ( maximumBy, filter, length, concat )
  41
  42 import GHC.Num
  43 import GHC.Int          ( Int32 )
  44 import GHC.Real         ( Integral(..), fromIntegral )
  45
  46 import GHC.IOBase       ( IO, IOArray, newIOArray, readIOArray, writeIOArray, unsafeReadIOArray, unsafeWriteIOArray,
  47                           IORef, newIORef, readIORef, writeIORef )
  48 import GHC.Err          ( undefined )
  49 import Control.Monad    ( when, mapM, sequence_ )
  50
  51 -----------------------------------------------------------------------
  52 myReadArray  :: IOArray Int32 a -> Int32 -> IO a
  53 myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
  54 #ifdef DEBUG
  55 myReadArray  = readIOArray
  56 myWriteArray = writeIOArray
  57 #else
  58 myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)
  59 myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
  60 #endif
  61
  62 -- | A hash table mapping keys of type @key@ to values of type @val@.
  63 --
  64 -- The implementation will grow the hash table as necessary, trying to
  65 -- maintain a reasonable average load per bucket in the table.
  66 --
  67 newtype HashTable key val = HashTable (IORef (HT key val))
  68 -- TODO: the IORef should really be an MVar.
  69
  70 data HT key val
  71   = HT {
  72         split  :: !Int32, -- Next bucket to split when expanding
  73         max_bucket :: !Int32, -- Max bucket of smaller table
  74         mask1  :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
  75         mask2  :: !Int32, -- Mask for doing the mod of h_2 (larger table)
  76         kcount :: !Int32, -- Number of keys
  77         bcount :: !Int32, -- Number of buckets
  78         dir    :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
  79         hash_fn :: key -> Int32,
  80         cmp    :: key -> key -> Bool
  81    }
  82
  83 {-
  84 ALTERNATIVE IMPLEMENTATION:
  85
  86 This works out slightly slower, because there's a tradeoff between
  87 allocating a complete new HT structure each time a modification is
  88 made (in the version above), and allocating new Int32s each time one
  89 of them is modified, as below.  Using FastMutInt instead of IORef
  90 Int32 helps, but yields an implementation which has about the same
  91 performance as the version above (and is more complex).
  92
  93 data HashTable key val
  94   = HashTable {
  95         split  :: !(IORef Int32), -- Next bucket to split when expanding
  96         max_bucket :: !(IORef Int32), -- Max bucket of smaller table
  97         mask1  :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
  98         mask2  :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
  99         kcount :: !(IORef Int32), -- Number of keys
 100         bcount :: !(IORef Int32), -- Number of buckets
 101         dir    :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
 102         hash_fn :: key -> Int32,
 103         cmp    :: key -> key -> Bool
 104    }
 105 -}
 106
 107
 108 -- -----------------------------------------------------------------------------
 109 -- Sample hash functions
 110
 111 -- $hash_functions
 112 --
 113 -- This implementation of hash tables uses the low-order /n/ bits of the hash
 114 -- value for a key, where /n/ varies as the hash table grows.  A good hash
 115 -- function therefore will give an even distribution regardless of /n/.
 116 --
 117 -- If your keyspace is integrals such that the low-order bits between
 118 -- keys are highly variable, then you could get away with using 'id'
 119 -- as the hash function.
 120 --
 121 -- We provide some sample hash functions for 'Int' and 'String' below.
 122
 123 -- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
 124 -- where P is a suitable prime (currently 1500007).  Should give
 125 -- reasonable results for most distributions of 'Int' values, except
 126 -- when the keys are all multiples of the prime!
 127 --
 128 hashInt :: Int -> Int32
 129 hashInt = (`rem` prime) . fromIntegral
 130
 131 -- | A sample hash function for 'String's.  The implementation is:
 132 --
 133 -- >    hashString = fromIntegral . foldr f 0
 134 -- >      where f c m = ord c + (m * 128) `rem` 1500007
 135 --
 136 -- which seems to give reasonable results.
 137 --
 138 hashString :: String -> Int32
 139 hashString = fromIntegral . foldr f 0
 140   where f c m = ord c + (m * 128) `rem` fromIntegral prime
 141
 142 -- | A prime larger than the maximum hash table size
 143 prime = 1500007 :: Int32
 144
 145 -- -----------------------------------------------------------------------------
 146 -- Parameters
 147
 148 sEGMENT_SIZE  = 1024  :: Int32  -- Size of a single hash table segment
 149 sEGMENT_SHIFT = 10    :: Int  -- derived
 150 sEGMENT_MASK  = 0x3ff :: Int32  -- derived
 151
 152 dIR_SIZE = 1024  :: Int32  -- Size of the segment directory
 153         -- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
 154
 155 hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
 156
 157 -- -----------------------------------------------------------------------------
 158 -- Creating a new hash table
 159
 160 -- | Creates a new hash table
 161 new
 162   :: (key -> key -> Bool)    -- ^ An equality comparison on keys
 163   -> (key -> Int32)          -- ^ A hash function on keys
 164   -> IO (HashTable key val)  -- ^ Returns: an empty hash table
 165
 166 new cmp hash_fn = do
 167   -- make a new hash table with a single, empty, segment
 168   dir     <- newIOArray (0,dIR_SIZE) undefined
 169   segment <- newIOArray (0,sEGMENT_SIZE-1) []
 170   myWriteArray dir 0 segment
 171
 172   let
 173     split  = 0
 174     max    = sEGMENT_SIZE
 175     mask1  = (sEGMENT_SIZE - 1)
 176     mask2  = (2 * sEGMENT_SIZE - 1)
 177     kcount = 0
 178     bcount = sEGMENT_SIZE
 179
 180     ht = HT {  dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
 181                kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
 182           }
 183
 184   table <- newIORef ht
 185   return (HashTable table)
 186
 187 -- -----------------------------------------------------------------------------
 188 -- Inserting a key\/value pair into the hash table
 189
 190 -- | Inserts an key\/value mapping into the hash table.
 191 insert :: HashTable key val -> key -> val -> IO ()
 192
 193 insert (HashTable ref) key val = do
 194   table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
 195   let table1 = table{ kcount = k+1 }
 196   table2 <-
 197         if (k > hLOAD * b)
 198            then expandHashTable table1
 199            else return table1
 200   writeIORef ref table2
 201   (segment_index,segment_offset) <- tableLocation table key
 202   segment <- myReadArray dir segment_index
 203   bucket <- myReadArray segment segment_offset
 204   myWriteArray segment segment_offset ((key,val):bucket)
 205   return ()
 206
 207 bucketIndex :: HT key val -> key -> IO Int32
 208 bucketIndex HT{ hash_fn=hash_fn,
 209                 split=split,
 210                 mask1=mask1,
 211                 mask2=mask2 } key = do
 212   let
 213     h = fromIntegral (hash_fn key)
 214     small_bucket = h .&. mask1
 215     large_bucket = h .&. mask2
 216   --
 217   if small_bucket < split
 218         then return large_bucket
 219         else return small_bucket
 220
 221 tableLocation :: HT key val -> key -> IO (Int32,Int32)
 222 tableLocation table key = do
 223   bucket_index <- bucketIndex table key
 224   let
 225     segment_index  = bucket_index `shiftR` sEGMENT_SHIFT
 226     segment_offset = bucket_index .&. sEGMENT_MASK
 227   --
 228   return (segment_index,segment_offset)
 229
 230 expandHashTable :: HT key val -> IO (HT key val)
 231 expandHashTable
 232       table@HT{ dir=dir,
 233                 split=split,
 234                 max_bucket=max,
 235                 mask2=mask2 } = do
 236   let
 237       oldsegment = split `shiftR` sEGMENT_SHIFT
 238       oldindex   = split .&. sEGMENT_MASK
 239
 240       newbucket  = max + split
 241       newsegment = newbucket `shiftR` sEGMENT_SHIFT
 242       newindex   = newbucket .&. sEGMENT_MASK
 243   --
 244   when (newindex == 0) $
 245         do segment <- newIOArray (0,sEGMENT_SIZE-1) []
 246            myWriteArray dir newsegment segment
 247   --
 248   let table' =
 249         if (split+1) < max
 250             then table{ split = split+1 }
 251                 -- we've expanded all the buckets in this table, so start from
 252                 -- the beginning again.
 253             else table{ split = 0,
 254                         max_bucket = max * 2,
 255                         mask1 = mask2,
 256                         mask2 = mask2 `shiftL` 1 .|. 1 }
 257   let
 258     split_bucket old new [] = do
 259         segment <- myReadArray dir oldsegment
 260         myWriteArray segment oldindex old
 261         segment <- myReadArray dir newsegment
 262         myWriteArray segment newindex new
 263     split_bucket old new ((k,v):xs) = do
 264         h <- bucketIndex table' k
 265         if h == newbucket
 266                 then split_bucket old ((k,v):new) xs
 267                 else split_bucket ((k,v):old) new xs
 268   --
 269   segment <- myReadArray dir oldsegment
 270   bucket <- myReadArray segment oldindex
 271   split_bucket [] [] bucket
 272   return table'
 273
 274 -- -----------------------------------------------------------------------------
 275 -- Deleting a mapping from the hash table
 276
 277 -- | Remove an entry from the hash table.
 278 delete :: HashTable key val -> key -> IO ()
 279
 280 delete (HashTable ref) key = do
 281   table@HT{ dir=dir, cmp=cmp } <- readIORef ref
 282   (segment_index,segment_offset) <- tableLocation table key
 283   segment <- myReadArray dir segment_index
 284   bucket <- myReadArray segment segment_offset
 285   myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
 286   return ()
 287
 288 -- -----------------------------------------------------------------------------
 289 -- Looking up an entry in the hash table
 290
 291 -- | Looks up the value of a key in the hash table.
 292 lookup :: HashTable key val -> key -> IO (Maybe val)
 293
 294 lookup (HashTable ref) key = do
 295   table@HT{ dir=dir, cmp=cmp } <- readIORef ref
 296   (segment_index,segment_offset) <- tableLocation table key
 297   segment <- myReadArray dir segment_index
 298   bucket <- myReadArray segment segment_offset
 299   case [ val | (key',val) <- bucket, cmp key key' ] of
 300         [] -> return Nothing
 301         (v:_) -> return (Just v)
 302
 303 -- -----------------------------------------------------------------------------
 304 -- Converting to/from lists
 305
 306 -- | Convert a list of key\/value pairs into a hash table.  Equality on keys
 307 -- is taken from the Eq instance for the key type.
 308 --
 309 fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
 310 fromList hash_fn list = do
 311   table <- new (==) hash_fn
 312   sequence_ [ insert table k v | (k,v) <- list ]
 313   return table
 314
 315 -- | Converts a hash table to a list of key\/value pairs.
 316 --
 317 toList :: HashTable key val -> IO [(key,val)]
 318 toList (HashTable ref) = do
 319   HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
 320   --
 321   let
 322     max_segment = (max + split - 1) `quot` sEGMENT_SIZE
 323   --
 324   segments <- mapM (segmentContents dir) [0 .. max_segment]
 325   return (concat segments)
 326  where
 327    segmentContents dir seg_index = do
 328      segment <- myReadArray dir seg_index
 329      bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
 330      return (concat bs)
 331
 332 -- -----------------------------------------------------------------------------
 333 -- Diagnostics
 334
 335 -- | This function is useful for determining whether your hash function
 336 -- is working well for your data set.  It returns the longest chain
 337 -- of key\/value pairs in the hash table for which all the keys hash to
 338 -- the same bucket.  If this chain is particularly long (say, longer
 339 -- than 10 elements), then it might be a good idea to try a different
 340 -- hash function.
 341 --
 342 longestChain :: HashTable key val -> IO [(key,val)]
 343 longestChain (HashTable ref) = do
 344   HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
 345   --
 346   let
 347     max_segment = (max + split - 1) `quot` sEGMENT_SIZE
 348   --
 349   --trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
 350   segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
 351   return (maximumBy lengthCmp segments)
 352  where
 353    segmentMaxChainLength dir seg_index = do
 354      segment <- myReadArray dir seg_index
 355      bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
 356      return (maximumBy lengthCmp bs)
 357
 358    lengthCmp x y = length x `compare` length y