1 -----------------------------------------------------------------------------
3 -- Module : Data.HashTable
4 -- Copyright : (c) The University of Glasgow 2003
5 -- License : BSD-style (see the file libraries/base/LICENSE)
7 -- Maintainer : libraries@haskell.org
8 -- Stability : provisional
9 -- Portability : portable
11 -- An implementation of extensible hash tables, as described in
12 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
13 -- pp. 446--457. The implementation is also derived from the one
14 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
16 -----------------------------------------------------------------------------
18 module Data.HashTable (
19 -- * Basic hash table operations
20 HashTable, new, insert, delete, lookup,
21 -- * Converting to and from lists
30 import Data.Char ( ord )
31 import Data.Int ( Int32 )
33 import Data.Array.Base
34 import Data.List ( maximumBy )
37 import Control.Monad ( when )
38 import Prelude hiding (lookup)
41 myReadArray :: IOArray Int32 a -> Int32 -> IO a
42 myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
44 myReadArray = readArray
45 myWriteArray = writeArray
47 myReadArray arr i = unsafeRead arr (fromIntegral i)
48 myWriteArray arr i x = unsafeWrite arr (fromIntegral i) x
51 -- | A hash table mapping keys of type @key@ to values of type @val@.
53 -- The implementation will grow the hash table as necessary, trying to
54 -- maintain a reasonable average load per bucket in the table.
56 newtype HashTable key val = HashTable (IORef (HT key val))
57 -- TODO: the IORef should really be an MVar.
61 split :: !Int32, -- Next bucket to split when expanding
62 max_bucket :: !Int32, -- Max bucket of smaller table
63 mask1 :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
64 mask2 :: !Int32, -- Mask for doing the mod of h_2 (larger table)
65 kcount :: !Int32, -- Number of keys
66 bcount :: !Int32, -- Number of buckets
67 dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
68 hash_fn :: key -> Int32,
69 cmp :: key -> key -> Bool
73 ALTERNATIVE IMPLEMENTATION:
75 This works out slightly slower, because there's a tradeoff between
76 allocating a complete new HT structure each time a modification is
77 made (in the version above), and allocating new Int32s each time one
78 of them is modified, as below. Using FastMutInt instead of IORef
79 Int32 helps, but yields an implementation which has about the same
80 performance as the version above (and is more complex).
82 data HashTable key val
84 split :: !(IORef Int32), -- Next bucket to split when expanding
85 max_bucket :: !(IORef Int32), -- Max bucket of smaller table
86 mask1 :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
87 mask2 :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
88 kcount :: !(IORef Int32), -- Number of keys
89 bcount :: !(IORef Int32), -- Number of buckets
90 dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
91 hash_fn :: key -> Int32,
92 cmp :: key -> key -> Bool
97 -- -----------------------------------------------------------------------------
98 -- Sample hash functions
102 -- This implementation of hash tables uses the low-order /n/ bits of the hash
103 -- value for a key, where /n/ varies as the hash table grows. A good hash
104 -- function therefore will give a good distribution regardless of /n/.
106 -- If your keyspace is integrals such that the low-order bits between
107 -- keys are highly variable, then you could get away with using 'id'
108 -- as the hash function.
110 -- We provide some sample hash functions for 'Int' and 'String' below.
112 -- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
113 -- where P is a suitable prime (currently 1500007). Should give
114 -- reasonable results for most distributions of 'Int' values, except
115 -- when the keys are all multiples of the prime!
117 hashInt :: Int -> Int32
118 hashInt = (`rem` prime) . fromIntegral
120 -- | A sample hash fucntion for 'String's. The implementation is:
122 -- > hashString = fromIntegral . foldr f 0
123 -- > where f c m = ord c + (m * 128) `rem` 1500007
125 -- which seems to give reasonable results.
127 hashString :: String -> Int32
128 hashString = fromIntegral . foldr f 0
129 where f c m = ord c + (m * 128) `rem` fromIntegral prime
131 -- a prime larger than the maximum hash table size
132 prime = 1500007 :: Int32
134 -- -----------------------------------------------------------------------------
137 sEGMENT_SIZE = 1024 :: Int32 -- Size of a single hash table segment
138 sEGMENT_SHIFT = 10 :: Int -- derived
139 sEGMENT_MASK = 0x3ff :: Int32 -- derived
141 dIR_SIZE = 1024 :: Int32 -- Size of the segment directory
142 -- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
144 hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
146 -- -----------------------------------------------------------------------------
147 -- Creating a new hash table
149 -- | Creates a new hash table
151 :: (key -> key -> Bool) -- ^ An equality comparison on keys
152 -> (key -> Int32) -- ^ A hash function on keys
153 -> IO (HashTable key val) -- ^ Returns: an empty hash table
156 -- make a new hash table with a single, empty, segment
157 dir <- newArray (0,dIR_SIZE) undefined
158 segment <- newArray (0,sEGMENT_SIZE-1) []
159 myWriteArray dir 0 segment
164 mask1 = (sEGMENT_SIZE - 1)
165 mask2 = (2 * sEGMENT_SIZE - 1)
167 bcount = sEGMENT_SIZE
169 ht = HT { dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
170 kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
174 return (HashTable table)
176 -- -----------------------------------------------------------------------------
177 -- Inserting a key\/value pair into the hash table
179 -- | Inserts an key\/value mapping into the hash table.
180 insert :: HashTable key val -> key -> val -> IO ()
182 insert (HashTable ref) key val = do
183 table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
184 let table1 = table{ kcount = k+1 }
187 then expandHashTable table1
189 writeIORef ref table2
190 (segment_index,segment_offset) <- tableLocation table key
191 segment <- myReadArray dir segment_index
192 bucket <- myReadArray segment segment_offset
193 myWriteArray segment segment_offset ((key,val):bucket)
196 bucketIndex :: HT key val -> key -> IO Int32
197 bucketIndex HT{ hash_fn=hash_fn,
200 mask2=mask2 } key = do
202 h = fromIntegral (hash_fn key)
203 small_bucket = h .&. mask1
204 large_bucket = h .&. mask2
206 if small_bucket < split
207 then return large_bucket
208 else return small_bucket
210 tableLocation :: HT key val -> key -> IO (Int32,Int32)
211 tableLocation table key = do
212 bucket_index <- bucketIndex table key
214 segment_index = bucket_index `shiftR` sEGMENT_SHIFT
215 segment_offset = bucket_index .&. sEGMENT_MASK
217 return (segment_index,segment_offset)
219 expandHashTable :: HT key val -> IO (HT key val)
226 oldsegment = split `shiftR` sEGMENT_SHIFT
227 oldindex = split .&. sEGMENT_MASK
229 newbucket = max + split
230 newsegment = newbucket `shiftR` sEGMENT_SHIFT
231 newindex = newbucket .&. sEGMENT_MASK
233 when (newindex == 0) $
234 do segment <- newArray (0,sEGMENT_SIZE-1) []
235 myWriteArray dir newsegment segment
239 then table{ split = split+1 }
240 -- we've expanded all the buckets in this table, so start from
241 -- the beginning again.
242 else table{ split = 0,
243 max_bucket = max * 2,
245 mask2 = mask2 `shiftL` 1 .|. 1 }
247 split_bucket old new [] = do
248 segment <- myReadArray dir oldsegment
249 myWriteArray segment oldindex old
250 segment <- myReadArray dir newsegment
251 myWriteArray segment newindex new
252 split_bucket old new ((k,v):xs) = do
253 h <- bucketIndex table' k
255 then split_bucket old ((k,v):new) xs
256 else split_bucket ((k,v):old) new xs
258 segment <- myReadArray dir oldsegment
259 bucket <- myReadArray segment oldindex
260 split_bucket [] [] bucket
263 -- -----------------------------------------------------------------------------
264 -- Deleting a mapping from the hash table
266 -- | Remove an entry from the hash table.
267 delete :: HashTable key val -> key -> IO ()
269 delete (HashTable ref) key = do
270 table@HT{ dir=dir, cmp=cmp } <- readIORef ref
271 (segment_index,segment_offset) <- tableLocation table key
272 segment <- myReadArray dir segment_index
273 bucket <- myReadArray segment segment_offset
274 myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
277 -- -----------------------------------------------------------------------------
278 -- Looking up an entry in the hash table
280 -- | Looks up the value of a key in the hash table.
281 lookup :: HashTable key val -> key -> IO (Maybe val)
283 lookup (HashTable ref) key = do
284 table@HT{ dir=dir, cmp=cmp } <- readIORef ref
285 (segment_index,segment_offset) <- tableLocation table key
286 segment <- myReadArray dir segment_index
287 bucket <- myReadArray segment segment_offset
288 case [ val | (key',val) <- bucket, cmp key key' ] of
290 (v:_) -> return (Just v)
292 -- -----------------------------------------------------------------------------
293 -- Converting to/from lists
295 -- | Convert a list of key\/value pairs into a hash table. Equality on keys
296 -- is taken from the Eq instance for the key type.
298 fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
299 fromList hash_fn list = do
300 table <- new (==) hash_fn
301 sequence_ [ insert table k v | (k,v) <- list ]
304 -- | Converts a hash table to a list of key\/value pairs.
306 toList :: HashTable key val -> IO [(key,val)]
307 toList (HashTable ref) = do
308 HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
311 max_segment = (max + split - 1) `quot` sEGMENT_SIZE
313 segments <- mapM (segmentContents dir) [0 .. max_segment]
314 return (concat segments)
316 segmentContents dir seg_index = do
317 segment <- myReadArray dir seg_index
318 bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
321 -- -----------------------------------------------------------------------------
324 -- | This function is useful for determining whether your hash function
325 -- is working well for your data set. It returns the longest chain
326 -- of key\/value pairs in the hash table for which all the keys hash to
327 -- the same bucket. If this chain is particularly long (say, longer
328 -- than 10 elements), then it might be a good idea to try a different
331 longestChain :: HashTable key val -> IO [(key,val)]
332 longestChain (HashTable ref) = do
333 HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
336 max_segment = (max + split - 1) `quot` sEGMENT_SIZE
338 --trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
339 segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
340 return (maximumBy lengthCmp segments)
342 segmentMaxChainLength dir seg_index = do
343 segment <- myReadArray dir seg_index
344 bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
345 return (maximumBy lengthCmp bs)
347 lengthCmp x y = length x `compare` length y