-{-# OPTIONS_GHC -fno-implicit-prelude #-}
+{-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
-----------------------------------------------------------------------------
-- |
-----------------------------------------------------------------------------
module Data.HashTable (
- -- * Basic hash table operations
- HashTable, new, insert, delete, lookup, update,
- -- * Converting to and from lists
- fromList, toList,
- -- * Hash functions
- -- $hash_functions
- hashInt, hashString,
- prime,
- -- * Diagnostics
- longestChain
+ -- * Basic hash table operations
+ HashTable, new, insert, delete, lookup, update,
+ -- * Converting to and from lists
+ fromList, toList,
+ -- * Hash functions
+ -- $hash_functions
+ hashInt, hashString,
+ prime,
+ -- * Diagnostics
+ longestChain
) where
-- This module is imported by Data.Dynamic, which is pretty low down in the
#ifdef __GLASGOW_HASKELL__
import GHC.Base
#else
-import Prelude hiding ( lookup )
+import Prelude hiding ( lookup )
#endif
-import Data.Tuple ( fst )
+import Data.Tuple ( fst )
import Data.Bits
import Data.Maybe
-import Data.List ( maximumBy, length, concat, foldl', partition )
-import Data.Int ( Int32 )
+import Data.List ( maximumBy, length, concat, foldl', partition )
+import Data.Int ( Int32 )
#if defined(__GLASGOW_HASKELL__)
import GHC.Num
-import GHC.Real ( fromIntegral )
-import GHC.Show ( Show(..) )
-import GHC.Int ( Int64 )
+import GHC.Real ( fromIntegral )
+import GHC.Show ( Show(..) )
+import GHC.Int ( Int64 )
-import GHC.IOBase ( IO, IOArray, newIOArray,
- unsafeReadIOArray, unsafeWriteIOArray, unsafePerformIO,
- IORef, newIORef, readIORef, writeIORef )
+import GHC.IO
+import GHC.IOArray
+import GHC.IORef
#else
-import Data.Char ( ord )
-import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
-import System.IO.Unsafe ( unsafePerformIO )
-import Data.Int ( Int64 )
+import Data.Char ( ord )
+import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
+import System.IO.Unsafe ( unsafePerformIO )
+import Data.Int ( Int64 )
# if defined(__HUGS__)
-import Hugs.IOArray ( IOArray, newIOArray,
- unsafeReadIOArray, unsafeWriteIOArray )
+import Hugs.IOArray ( IOArray, newIOArray,
+ unsafeReadIOArray, unsafeWriteIOArray )
# elif defined(__NHC__)
-import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
+import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
# endif
#endif
-import Control.Monad ( mapM, mapM_, sequence_ )
+import Control.Monad ( mapM, mapM_, sequence_ )
-----------------------------------------------------------------------
readHTArray :: HTArray a -> Int32 -> IO a
writeMutArray :: MutArray a -> Int32 -> a -> IO ()
-freezeArray :: MutArray a -> IO (HTArray a)
-thawArray :: HTArray a -> IO (MutArray a)
newMutArray :: (Int32, Int32) -> a -> IO (MutArray a)
-#if defined(DEBUG) || defined(__NHC__)
+newMutArray = newIOArray
type MutArray a = IOArray Int32 a
type HTArray a = MutArray a
-newMutArray = newIOArray
+#if defined(DEBUG) || defined(__NHC__)
readHTArray = readIOArray
writeMutArray = writeIOArray
-freezeArray = return
-thawArray = return
#else
-type MutArray a = IOArray Int32 a
-type HTArray a = MutArray a -- Array Int32 a
-newMutArray = newIOArray
-readHTArray arr i = readMutArray arr i -- return $! (unsafeAt arr (fromIntegral i))
-readMutArray :: MutArray a -> Int32 -> IO a
-readMutArray arr i = unsafeReadIOArray arr (fromIntegral i)
+readHTArray arr i = unsafeReadIOArray arr (fromIntegral i)
writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
-freezeArray = return -- unsafeFreeze
-thawArray = return -- unsafeThaw
#endif
data HashTable key val = HashTable {
- cmp :: !(key -> key -> Bool),
- hash_fn :: !(key -> Int32),
+ cmp :: !(key -> key -> Bool),
+ hash_fn :: !(key -> Int32),
tab :: !(IORef (HT key val))
}
-- TODO: the IORef should really be an MVar.
data HT key val
= HT {
- kcount :: !Int32, -- Total number of keys.
+ kcount :: !Int32, -- Total number of keys.
bmask :: !Int32,
- buckets :: !(HTArray [(key,val)])
+ buckets :: !(HTArray [(key,val)])
}
-- ------------------------------------------------------------
-- stats :: IO String
-- stats = fmap show $ readIORef hashData
--- -----------------------------------------------------------------------------
+-- ----------------------------------------------------------------------------
-- Sample hash functions
-- $hash_functions
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
--- keys are highly variable, then you could get away with using 'id'
+-- keys are highly variable, then you could get away with using 'fromIntegral'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.
golden :: Int32
-golden = -1640531527
+golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
+-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
+-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
+-- Whereas the above works well and contains no hash duplications for
+-- [-32767..65536]
+
+hashInt32 :: Int32 -> Int32
+hashInt32 x = mulHi x golden + x
-- | A sample (and useful) hash function for Int and Int32,
-- implemented by extracting the uppermost 32 bits of the 64-bit
--- result of multiplying by a 32-bit constant. The constant is from
+-- result of multiplying by a 33-bit constant. The constant is from
-- Knuth, derived from the golden ratio:
--
--- > golden = round ((sqrt 5 - 1) * 2^31) :: Int
+-- > golden = round ((sqrt 5 - 1) * 2^32)
+--
+-- We get good key uniqueness on small inputs
+-- (a problem with previous versions):
+-- (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
+--
hashInt :: Int -> Int32
-hashInt x = mulHi (fromIntegral x) golden
+hashInt x = hashInt32 (fromIntegral x)
-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
mulHi :: Int32 -> Int32 -> Int32
mulHi a b = fromIntegral (r `shiftR` 32)
- where r :: Int64
- r = fromIntegral a * fromIntegral b :: Int64
+ where r :: Int64
+ r = fromIntegral a * fromIntegral b
-- | A sample hash function for Strings. We keep multiplying by the
-- golden ratio and adding. The implementation is:
--
--- > hashString = foldl' f 0
--- > where f m c = fromIntegral (ord c) + mulHi m golden
+-- > hashString = foldl' f golden
+-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m
+-- > magic = 0xdeadbeef
+--
+-- Where hashInt32 works just as hashInt shown above.
+--
+-- Knuth argues that repeated multiplication by the golden ratio
+-- will minimize gaps in the hash space, and thus it's a good choice
+-- for combining together multiple keys to form one.
--
--- Note that this has not been extensively tested for reasonability,
--- but Knuth argues that repeated multiplication by the golden ratio
--- will minimize gaps in the hash space.
+-- Here we know that individual characters c are often small, and this
+-- produces frequent collisions if we use ord c alone. A
+-- particular problem are the shorter low ASCII and ISO-8859-1
+-- character strings. We pre-multiply by a magic twiddle factor to
+-- obtain a good distribution. In fact, given the following test:
+--
+-- > testp :: Int32 -> Int
+-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
+-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
+-- > hs = foldl' f golden
+-- > f m c = fromIntegral (ord c) * k + hashInt32 m
+-- > n = 100000
+--
+-- We discover that testp magic = 0.
+
hashString :: String -> Int32
-hashString = foldl' f 0
- where f m c = fromIntegral (ord c) + mulHi m golden
+hashString = foldl' f golden
+ where f m c = fromIntegral (ord c) * magic + hashInt32 m
+ magic = 0xdeadbeef
-- | A prime larger than the maximum hash table size
prime :: Int32
--
new
:: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys
- -> (key -> Int32) -- ^ @hash@: A hash function on keys
+ -> (key -> Int32) -- ^ @hash@: A hash function on keys
-> IO (HashTable key val) -- ^ Returns: an empty hash table
new cmpr hash = do
recordNew
-- make a new hash table with a single, empty, segment
let mask = tABLE_MIN-1
- bkts' <- newMutArray (0,mask) []
- bkts <- freezeArray bkts'
+ bkts <- newMutArray (0,mask) []
let
kcnt = 0
(bckt', inserts, result) <- return $ bucketFn bckt
let k' = k + inserts
table1 = table { kcount=k' }
- bkts' <- thawArray bkts
- writeMutArray bkts' indx bckt'
- freezeArray bkts'
+ writeMutArray bkts indx bckt'
table2 <- if canEnlarge == CanInsert && inserts > 0 then do
recordIns inserts k' bckt'
if tooBig k' b
then return table
else do
--
- newbkts' <- newMutArray (0,newmask) []
+ newbkts <- newMutArray (0,newmask) []
let
splitBucket oldindex = do
bucket <- readHTArray bkts oldindex
let (oldb,newb) =
partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
- writeMutArray newbkts' oldindex oldb
- writeMutArray newbkts' (oldindex + oldsize) newb
+ writeMutArray newbkts oldindex oldb
+ writeMutArray newbkts (oldindex + oldsize) newb
mapM_ splitBucket [0..mask]
- newbkts <- freezeArray newbkts'
-
return ( table{ buckets=newbkts, bmask=newmask } )
-- -----------------------------------------------------------------------------