-{-# OPTIONS_GHC -fno-implicit-prelude #-}
+{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
-----------------------------------------------------------------------------
-- |
import Hugs.IOArray ( IOArray, newIOArray,
unsafeReadIOArray, unsafeWriteIOArray )
# elif defined(__NHC__)
-import NHC.IOExtras ( IOArray, newIOArray )
+import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
# endif
#endif
import Control.Monad ( mapM, mapM_, sequence_ )
#if defined(DEBUG) || defined(__NHC__)
type MutArray a = IOArray Int32 a
type HTArray a = MutArray a
-newMutArray = newArray
-readHTArray = readArray
-writeMutArray = writeArray
+newMutArray = newIOArray
+readHTArray = readIOArray
+writeMutArray = writeIOArray
freezeArray = return
thawArray = return
#else
-- stats :: IO String
-- stats = fmap show $ readIORef hashData
--- -----------------------------------------------------------------------------
+-- ----------------------------------------------------------------------------
-- Sample hash functions
-- $hash_functions
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
--- keys are highly variable, then you could get away with using 'id'
+-- keys are highly variable, then you could get away with using 'fromIntegral'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.
golden :: Int32
-golden = -1640531527
+golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
+-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
+-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
+-- Whereas the above works well and contains no hash duplications for
+-- [-32767..65536]
+
+hashInt32 :: Int32 -> Int32
+hashInt32 x = mulHi x golden + x
-- | A sample (and useful) hash function for Int and Int32,
-- implemented by extracting the uppermost 32 bits of the 64-bit
--- result of multiplying by a 32-bit constant. The constant is from
+-- result of multiplying by a 33-bit constant. The constant is from
-- Knuth, derived from the golden ratio:
--- > golden = round ((sqrt 5 - 1) * 2^31) :: Int
+--
+-- > golden = round ((sqrt 5 - 1) * 2^32)
+--
+-- We get good key uniqueness on small inputs
+-- (a problem with previous versions):
+-- (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
+--
hashInt :: Int -> Int32
-hashInt x = mulHi (fromIntegral x) golden
+hashInt x = hashInt32 (fromIntegral x)
-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
mulHi :: Int32 -> Int32 -> Int32
mulHi a b = fromIntegral (r `shiftR` 32)
- where r :: Int64
- r = fromIntegral a * fromIntegral b :: Int64
+ where r :: Int64
+ r = fromIntegral a * fromIntegral b
-- | A sample hash function for Strings. We keep multiplying by the
-- golden ratio and adding. The implementation is:
--
--- > hashString = foldl' f 0
--- > where f m c = fromIntegral (ord c) + mulHi m golden
+-- > hashString = foldl' f golden
+-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m
+-- > magic = 0xdeadbeef
--
--- Note that this has not been extensively tested for reasonability,
--- but Knuth argues that repeated multiplication by the golden ratio
--- will minimize gaps in the hash space.
+-- Where hashInt32 works just as hashInt shown above.
+--
+-- Knuth argues that repeated multiplication by the golden ratio
+-- will minimize gaps in the hash space, and thus it's a good choice
+-- for combining together multiple keys to form one.
+--
+-- Here we know that individual characters c are often small, and this
+-- produces frequent collisions if we use ord c alone. A
+-- particular problem are the shorter low ASCII and ISO-8859-1
+-- character strings. We pre-multiply by a magic twiddle factor to
+-- obtain a good distribution. In fact, given the following test:
+--
+-- > testp :: Int32 -> Int
+-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
+-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
+-- > hs = foldl' f golden
+-- > f m c = fromIntegral (ord c) * k + hashInt32 m
+-- > n = 100000
+--
+-- We discover that testp magic = 0.
+
hashString :: String -> Int32
-hashString = foldl' f 0
- where f m c = fromIntegral (ord c) + mulHi m golden
+hashString = foldl' f golden
+ where f m c = fromIntegral (ord c) * magic + hashInt32 m
+ magic = 0xdeadbeef
-- | A prime larger than the maximum hash table size
prime :: Int32
then return table
else do
--
- newbkts' <- newMutArray (0,newmask) []
+ newbkts' <- newMutArray (0,newmask) []
- let
- splitBucket oldindex = do
- bucket <- readHTArray bkts oldindex
- let (oldb,newb) =
+ let
+ splitBucket oldindex = do
+ bucket <- readHTArray bkts oldindex
+ let (oldb,newb) =
partition ((oldindex==). bucketIndex newmask . hash . fst) bucket
- writeMutArray newbkts' oldindex oldb
- writeMutArray newbkts' (oldindex + oldsize) newb
- mapM_ splitBucket [0..mask]
+ writeMutArray newbkts' oldindex oldb
+ writeMutArray newbkts' (oldindex + oldsize) newb
+ mapM_ splitBucket [0..mask]
- newbkts <- freezeArray newbkts'
+ newbkts <- freezeArray newbkts'
- return ( table{ buckets=newbkts, bmask=newmask } )
+ return ( table{ buckets=newbkts, bmask=newmask } )
-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table