1 {-# OPTIONS -fno-implicit-prelude #-}
3 -----------------------------------------------------------------------------
5 -- Module : Data.HashTable
6 -- Copyright : (c) The University of Glasgow 2003
7 -- License : BSD-style (see the file libraries/base/LICENSE)
9 -- Maintainer : libraries@haskell.org
10 -- Stability : provisional
11 -- Portability : portable
13 -- An implementation of extensible hash tables, as described in
14 -- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
15 -- pp. 446--457. The implementation is also derived from the one
16 -- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
18 -----------------------------------------------------------------------------
20 module Data.HashTable (
21 -- * Basic hash table operations
22 HashTable, new, insert, delete, lookup,
23 -- * Converting to and from lists
33 -- This module is imported by Data.Dynamic, which is pretty low down in the
34 -- module hierarchy, so don't import "high-level" modules
37 import Data.Tuple ( fst )
40 import Data.List ( maximumBy, filter, length, concat )
43 import GHC.Int ( Int32 )
44 import GHC.Real ( Integral(..), fromIntegral )
46 import GHC.IOBase ( IO, IOArray, newIOArray, readIOArray, writeIOArray, unsafeReadIOArray, unsafeWriteIOArray,
47 IORef, newIORef, readIORef, writeIORef )
48 import GHC.Err ( undefined )
49 import Control.Monad ( when, mapM, sequence_ )
51 -----------------------------------------------------------------------
-- Internal array accessors used throughout this module.
--
-- NOTE(review): the original had a CPP conditional (DEBUG builds used the
-- bounds-checked 'readIOArray'/'writeIOArray'); the conditional was lost
-- from this copy, leaving two conflicting sets of equations, which is a
-- compile error.  We keep the release-build (unchecked) variants: every
-- index used in this module is first masked into range (see 'bucketIndex'
-- and 'tableLocation'), so the bounds checks are redundant.
myReadArray  :: IOArray Int32 a -> Int32 -> IO a
myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)

myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
62 -- | A hash table mapping keys of type @key@ to values of type @val@.
64 -- The implementation will grow the hash table as necessary, trying to
65 -- maintain a reasonable average load per bucket in the table.
-- A hash table is a mutable reference to the current 'HT' state record;
-- each update builds a new 'HT' value and swaps it in.
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar.
-- | Internal state of a hash table (held behind the 'HashTable' IORef).
-- NOTE(review): the declaration head (@data HT key val = HT {@) and the
-- closing brace were lost from this copy of the file; reconstructed here
-- from the record construction visible in 'new'.
data HT key val
  = HT {
      split      :: !Int32,  -- Next bucket to split when expanding
      max_bucket :: !Int32,  -- Max bucket of smaller table
      mask1      :: !Int32,  -- Mask for doing the mod of h_1 (smaller table)
      mask2      :: !Int32,  -- Mask for doing the mod of h_2 (larger table)
      kcount     :: !Int32,  -- Number of keys
      bcount     :: !Int32,  -- Number of buckets
      dir        :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
      hash_fn    :: key -> Int32,       -- hash function for keys
      cmp        :: key -> key -> Bool  -- equality predicate on keys
  }
{-
ALTERNATIVE IMPLEMENTATION:

This works out slightly slower, because there's a tradeoff between
allocating a complete new HT structure each time a modification is
made (in the version above), and allocating new Int32s each time one
of them is modified, as below. Using FastMutInt instead of IORef
Int32 helps, but yields an implementation which has about the same
performance as the version above (and is more complex).

data HashTable key val
 = HashTable {
    split :: !(IORef Int32), -- Next bucket to split when expanding
    max_bucket :: !(IORef Int32), -- Max bucket of smaller table
    mask1 :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
    mask2 :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
    kcount :: !(IORef Int32), -- Number of keys
    bcount :: !(IORef Int32), -- Number of buckets
    dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
    hash_fn :: key -> Int32,
    cmp :: key -> key -> Bool
  }
-}
108 -- -----------------------------------------------------------------------------
109 -- Sample hash functions
113 -- This implementation of hash tables uses the low-order /n/ bits of the hash
114 -- value for a key, where /n/ varies as the hash table grows. A good hash
115 -- function therefore will give an even distribution regardless of /n/.
117 -- If your keyspace is integrals such that the low-order bits between
118 -- keys are highly variable, then you could get away with using 'id'
119 -- as the hash function.
121 -- We provide some sample hash functions for 'Int' and 'String' below.
123 -- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
124 -- where P is a suitable prime (currently 1500007). Should give
125 -- reasonable results for most distributions of 'Int' values, except
126 -- when the keys are all multiples of the prime!
hashInt :: Int -> Int32
hashInt n = fromIntegral n `rem` prime
131 -- | A sample hash function for 'String's. The implementation is:
133 -- > hashString = fromIntegral . foldr f 0
134 -- > where f c m = ord c + (m * 128) `rem` 1500007
136 -- which seems to give reasonable results.
hashString :: String -> Int32
hashString s = fromIntegral (foldr step 0 s)
  where
    -- fold each character into the accumulator, reducing mod a large prime
    step c acc = ord c + (acc * 128) `rem` fromIntegral prime
-- | A prime larger than the maximum hash table size
-- (sEGMENT_SIZE * dIR_SIZE = 2^20 < 1500007); used by the sample hash
-- functions above.
prime = 1500007 :: Int32
145 -- -----------------------------------------------------------------------------
-- Fixed sizing parameters.  sEGMENT_SIZE must be a power of two: the
-- SHIFT and MASK constants below are derived from it.
sEGMENT_SIZE = 1024 :: Int32 -- Size of a single hash table segment
sEGMENT_SHIFT = 10 :: Int -- derived: log2 sEGMENT_SIZE
sEGMENT_MASK = 0x3ff :: Int32 -- derived: sEGMENT_SIZE - 1
dIR_SIZE = 1024 :: Int32 -- Size of the segment directory
-- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
157 -- -----------------------------------------------------------------------------
158 -- Creating a new hash table
-- | Creates a new hash table.
--
-- NOTE(review): the equation head, the 'let' keyword and several of the
-- initial bindings were lost from this copy of the file; reconstructed
-- below from the record construction and the signature.
new
  :: (key -> key -> Bool)    -- ^ An equality comparison on keys
  -> (key -> Int32)          -- ^ A hash function on keys
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table
new cmp hash_fn = do
  -- make a new hash table with a single, empty, segment
  dir     <- newIOArray (0,dIR_SIZE-1) undefined
    -- (0,dIR_SIZE-1): segment indices only ever range over
    -- [0 .. dIR_SIZE-1]; the original (0,dIR_SIZE) allocated one
    -- unused extra slot (off-by-one).
  segment <- newIOArray (0,sEGMENT_SIZE-1) []
  myWriteArray dir 0 segment
  let
    -- initial state: one segment, no splitting started yet
    split  = 0
    max    = sEGMENT_SIZE            -- size of the "smaller" table
    mask1  = (sEGMENT_SIZE - 1)
    mask2  = (2 * sEGMENT_SIZE - 1)
    kcount = 0
    bcount = sEGMENT_SIZE
    ht = HT { dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
              kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
            }
  table <- newIORef ht
  return (HashTable table)
187 -- -----------------------------------------------------------------------------
188 -- Inserting a key\/value pair into the hash table
-- | Inserts a key\/value mapping into the hash table.
insert :: HashTable key val -> key -> val -> IO ()
insert (HashTable ref) key val = do
  table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
  let table1 = table{ kcount = k+1 }
  -- NOTE(review): the 'table2 <- if …'/'else' lines were lost from this
  -- copy; reconstructed: grow the table when the average bucket load
  -- would exceed hLOAD.
  table2 <- if k > hLOAD * b
                then expandHashTable table1
                else return table1
  writeIORef ref table2
  -- Locate the bucket using table2 (the post-expansion state): using the
  -- stale pre-expansion table here could file the entry under a bucket
  -- that 'lookup' (which reads table2's masks) would never search.
  (segment_index,segment_offset) <- tableLocation table2 key
  segment <- myReadArray dir segment_index
  bucket <- myReadArray segment segment_offset
  myWriteArray segment segment_offset ((key,val):bucket)
-- Maps a key to its current bucket number.  Buckets below 'split' have
-- already been split into the larger table, so for them the wider mask
-- applies (standard linear-hashing addressing).
-- NOTE(review): the record bindings for split/mask1 (used in the body)
-- and the 'let' keyword were lost from this copy; restored here.
bucketIndex :: HT key val -> key -> IO Int32
bucketIndex HT{ hash_fn=hash_fn,
                split=split,
                mask1=mask1,
                mask2=mask2 } key = do
  let
    h = fromIntegral (hash_fn key)
    small_bucket = h .&. mask1
    large_bucket = h .&. mask2
  if small_bucket < split
        then return large_bucket
        else return small_bucket
-- Maps a key to its (segment index, offset within segment) pair.
-- NOTE(review): the 'let' keyword was lost from this copy; restored.
tableLocation :: HT key val -> key -> IO (Int32,Int32)
tableLocation table key = do
  bucket_index <- bucketIndex table key
  let
    segment_index  = bucket_index `shiftR` sEGMENT_SHIFT
    segment_offset = bucket_index .&. sEGMENT_MASK
  return (segment_index,segment_offset)
-- Expands the table by splitting one bucket (Larson's linear hashing):
-- the chain in bucket 'split' of the smaller table is rehashed into two
-- buckets of the larger table.
-- NOTE(review): this copy of the file has lost several lines of this
-- function (the equation head, the 'let' keywords, the condition on
-- 'table'', the presumed 'mask1 = mask2,' update, the presumed
-- 'if h == newbucket' test, and the final 'return table'').  The
-- surviving code lines are left untouched; each gap is flagged below.
expandHashTable :: HT key val -> IO (HT key val)
-- NOTE(review): missing here: the equation head binding dir/split/max/
-- mask2 (and possibly more fields) from the HT record, followed by a
-- 'let' opening the bindings below.
    oldsegment = split `shiftR` sEGMENT_SHIFT
    oldindex = split .&. sEGMENT_MASK
    -- the bucket being created by this expansion step
    newbucket = max + split
    newsegment = newbucket `shiftR` sEGMENT_SHIFT
    newindex = newbucket .&. sEGMENT_MASK
  -- a fresh segment is needed whenever the new bucket is the first
  -- bucket of a segment
  when (newindex == 0) $
     do segment <- newIOArray (0,sEGMENT_SIZE-1) []
        myWriteArray dir newsegment segment
-- NOTE(review): missing here: 'let table'' and the condition selecting
-- between the two record updates below (presumably whether 'split' has
-- reached the end of the smaller table).
     then table{ split = split+1 }
        -- we've expanded all the buckets in this table, so start from
        -- the beginning again.
     else table{ split = 0,
                 max_bucket = max * 2,
-- NOTE(review): missing here: presumably 'mask1 = mask2,'
                 mask2 = mask2 `shiftL` 1 .|. 1 }
-- NOTE(review): missing here: a 'let' opening the local helper below,
-- which partitions the old chain between the old and new buckets.
    split_bucket old new [] = do
        segment <- myReadArray dir oldsegment
        myWriteArray segment oldindex old
        segment <- myReadArray dir newsegment
        myWriteArray segment newindex new
    split_bucket old new ((k,v):xs) = do
        h <- bucketIndex table' k
-- NOTE(review): missing here: presumably 'if h == newbucket'
        then split_bucket old ((k,v):new) xs
        else split_bucket ((k,v):old) new xs
  -- rehash the chain of the bucket being split
  segment <- myReadArray dir oldsegment
  bucket <- myReadArray segment oldindex
  split_bucket [] [] bucket
-- NOTE(review): missing here: the final 'return table''
274 -- -----------------------------------------------------------------------------
275 -- Deleting a mapping from the hash table
-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()
delete (HashTable ref) key = do
  table@HT{ dir=dir, cmp=cmp } <- readIORef ref
  (seg_ix, seg_off) <- tableLocation table key
  seg   <- myReadArray dir seg_ix
  chain <- myReadArray seg seg_off
  -- keep every pair whose key does not match the one being deleted
  let survives (k,_) = not (key `cmp` k)
  myWriteArray seg seg_off (filter survives chain)
288 -- -----------------------------------------------------------------------------
289 -- Looking up an entry in the hash table
-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup (HashTable ref) key = do
  table@HT{ dir=dir, cmp=cmp } <- readIORef ref
  (segment_index,segment_offset) <- tableLocation table key
  segment <- myReadArray dir segment_index
  bucket <- myReadArray segment segment_offset
  -- all values whose key matches; the head of the chain is the most
  -- recently inserted, so it wins
  case [ val | (key',val) <- bucket, cmp key key' ] of
    []    -> return Nothing  -- NOTE(review): branch was lost from this
                             -- copy; restored (required for exhaustiveness)
    (v:_) -> return (Just v)
303 -- -----------------------------------------------------------------------------
304 -- Converting to/from lists
-- | Convert a list of key\/value pairs into a hash table.  Equality on keys
-- is taken from the Eq instance for the key type.
fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash_fn list = do
  table <- new (==) hash_fn
  sequence_ [ insert table k v | (k,v) <- list ]
  return table  -- NOTE(review): this line was lost from this copy;
                -- restored (required by the result type)
-- | Converts a hash table to a list of key\/value pairs.
-- NOTE(review): the 'let' and 'where' keywords and the final
-- 'return (concat bs)' were lost from this copy; reconstructed below.
toList :: HashTable key val -> IO [(key,val)]
toList (HashTable ref) = do
  HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
  let
    -- index of the last segment currently in use
    max_segment = (max + split - 1) `quot` sEGMENT_SIZE
  segments <- mapM (segmentContents dir) [0 .. max_segment]
  return (concat segments)
 where
  -- all key/value pairs held in one segment
  segmentContents dir seg_index = do
    segment <- myReadArray dir seg_index
    bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
    return (concat bs)
332 -- -----------------------------------------------------------------------------
-- | This function is useful for determining whether your hash function
-- is working well for your data set.  It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket.  If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
-- NOTE(review): the 'let'/'where' keywords and the end of the haddock
-- sentence above were lost from this copy; reconstructed below.
longestChain :: HashTable key val -> IO [(key,val)]
longestChain (HashTable ref) = do
  HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
  let
    -- index of the last segment currently in use
    max_segment = (max + split - 1) `quot` sEGMENT_SIZE
  --trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
  segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
  return (maximumBy lengthCmp segments)
 where
  -- longest bucket chain within one segment
  segmentMaxChainLength dir seg_index = do
    segment <- myReadArray dir seg_index
    bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
    return (maximumBy lengthCmp bs)
  lengthCmp x y = length x `compare` length y