X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=Data%2FHashTable.hs;h=48ecb0bbc3718b93c575f19b5fcab79173b590cb;hb=ec0f319384683be6603f5b2862e6621fe243dc65;hp=8ea79099f0b37726efbb247c59103cdcfdaf8e7f;hpb=1a3710614eec1bba6f241f44e9bf84249d43b58d;p=ghc-base.git diff --git a/Data/HashTable.hs b/Data/HashTable.hs index 8ea7909..48ecb0b 100644 --- a/Data/HashTable.hs +++ b/Data/HashTable.hs @@ -1,4 +1,4 @@ -{-# OPTIONS_GHC -fno-implicit-prelude #-} +{-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-} ----------------------------------------------------------------------------- -- | @@ -18,16 +18,16 @@ ----------------------------------------------------------------------------- module Data.HashTable ( - -- * Basic hash table operations - HashTable, new, insert, delete, lookup, update, - -- * Converting to and from lists - fromList, toList, - -- * Hash functions - -- $hash_functions - hashInt, hashString, - prime, - -- * Diagnostics - longestChain + -- * Basic hash table operations + HashTable, new, insert, delete, lookup, update, + -- * Converting to and from lists + fromList, toList, + -- * Hash functions + -- $hash_functions + hashInt, hashString, + prime, + -- * Diagnostics + longestChain ) where -- This module is imported by Data.Dynamic, which is pretty low down in the @@ -36,36 +36,36 @@ module Data.HashTable ( #ifdef __GLASGOW_HASKELL__ import GHC.Base #else -import Prelude hiding ( lookup ) +import Prelude hiding ( lookup ) #endif -import Data.Tuple ( fst ) +import Data.Tuple ( fst ) import Data.Bits import Data.Maybe -import Data.List ( maximumBy, length, concat, foldl', partition ) -import Data.Int ( Int32 ) +import Data.List ( maximumBy, length, concat, foldl', partition ) +import Data.Int ( Int32 ) #if defined(__GLASGOW_HASKELL__) import GHC.Num -import GHC.Real ( fromIntegral ) -import GHC.Show ( Show(..) ) -import GHC.Int ( Int64 ) +import GHC.Real ( fromIntegral ) +import GHC.Show ( Show(..) ) +import GHC.Int ( Int64 ) -import GHC.IOBase ( IO, IOArray, newIOArray, - unsafeReadIOArray, unsafeWriteIOArray, unsafePerformIO, - IORef, newIORef, readIORef, writeIORef ) +import GHC.IOBase ( IO, IOArray, newIOArray, + unsafeReadIOArray, unsafeWriteIOArray, unsafePerformIO, + IORef, newIORef, readIORef, writeIORef ) #else -import Data.Char ( ord ) -import Data.IORef ( IORef, newIORef, readIORef, writeIORef ) -import System.IO.Unsafe ( unsafePerformIO ) -import Data.Int ( Int64 ) +import Data.Char ( ord ) +import Data.IORef ( IORef, newIORef, readIORef, writeIORef ) +import System.IO.Unsafe ( unsafePerformIO ) +import Data.Int ( Int64 ) # if defined(__HUGS__) -import Hugs.IOArray ( IOArray, newIOArray, - unsafeReadIOArray, unsafeWriteIOArray ) +import Hugs.IOArray ( IOArray, newIOArray, + unsafeReadIOArray, unsafeWriteIOArray ) # elif defined(__NHC__) -import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray ) +import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray ) # endif #endif -import Control.Monad ( mapM, mapM_, sequence_ ) +import Control.Monad ( mapM, mapM_, sequence_ ) ----------------------------------------------------------------------- @@ -101,17 +101,17 @@ thawArray = return -- unsafeThaw #endif data HashTable key val = HashTable { - cmp :: !(key -> key -> Bool), - hash_fn :: !(key -> Int32), + cmp :: !(key -> key -> Bool), + hash_fn :: !(key -> Int32), tab :: !(IORef (HT key val)) } -- TODO: the IORef should really be an MVar. data HT key val = HT { - kcount :: !Int32, -- Total number of keys. + kcount :: !Int32, -- Total number of keys. bmask :: !Int32, - buckets :: !(HTArray [(key,val)]) + buckets :: !(HTArray [(key,val)]) } -- ------------------------------------------------------------ @@ -170,7 +170,7 @@ recordLookup = instrument lkup -- stats :: IO String -- stats = fmap show $ readIORef hashData --- ----------------------------------------------------------------------------- +-- ---------------------------------------------------------------------------- -- Sample hash functions -- $hash_functions @@ -180,41 +180,73 @@ recordLookup = instrument lkup -- function therefore will give an even distribution regardless of /n/. -- -- If your keyspace is integrals such that the low-order bits between --- keys are highly variable, then you could get away with using 'id' +-- keys are highly variable, then you could get away with using 'fromIntegral' -- as the hash function. -- -- We provide some sample hash functions for 'Int' and 'String' below. golden :: Int32 -golden = -1640531527 +golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32 +-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32 +-- but that has bad mulHi properties (even adding 2^32 to get its inverse) +-- Whereas the above works well and contains no hash duplications for +-- [-32767..65536] + +hashInt32 :: Int32 -> Int32 +hashInt32 x = mulHi x golden + x -- | A sample (and useful) hash function for Int and Int32, -- implemented by extracting the uppermost 32 bits of the 64-bit --- result of multiplying by a 32-bit constant. The constant is from +-- result of multiplying by a 33-bit constant. The constant is from -- Knuth, derived from the golden ratio: -- --- > golden = round ((sqrt 5 - 1) * 2^31) :: Int +-- > golden = round ((sqrt 5 - 1) * 2^32) +-- +-- We get good key uniqueness on small inputs +-- (a problem with previous versions): +-- (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768 +-- hashInt :: Int -> Int32 -hashInt x = mulHi (fromIntegral x) golden +hashInt x = hashInt32 (fromIntegral x) -- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply mulHi :: Int32 -> Int32 -> Int32 mulHi a b = fromIntegral (r `shiftR` 32) - where r :: Int64 - r = fromIntegral a * fromIntegral b :: Int64 + where r :: Int64 + r = fromIntegral a * fromIntegral b -- | A sample hash function for Strings. We keep multiplying by the -- golden ratio and adding. The implementation is: -- --- > hashString = foldl' f 0 --- > where f m c = fromIntegral (ord c) + mulHi m golden +-- > hashString = foldl' f golden +-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m +-- > magic = 0xdeadbeef +-- +-- Where hashInt32 works just as hashInt shown above. -- --- Note that this has not been extensively tested for reasonability, --- but Knuth argues that repeated multiplication by the golden ratio --- will minimize gaps in the hash space. +-- Knuth argues that repeated multiplication by the golden ratio +-- will minimize gaps in the hash space, and thus it's a good choice +-- for combining together multiple keys to form one. +-- +-- Here we know that individual characters c are often small, and this +-- produces frequent collisions if we use ord c alone. A +-- particular problem are the shorter low ASCII and ISO-8859-1 +-- character strings. We pre-multiply by a magic twiddle factor to +-- obtain a good distribution. In fact, given the following test: +-- +-- > testp :: Int32 -> Int +-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls +-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']] +-- > hs = foldl' f golden +-- > f m c = fromIntegral (ord c) * k + hashInt32 m +-- > n = 100000 +-- +-- We discover that testp magic = 0. + hashString :: String -> Int32 -hashString = foldl' f 0 - where f m c = fromIntegral (ord c) + mulHi m golden +hashString = foldl' f golden + where f m c = fromIntegral (ord c) * magic + hashInt32 m + magic = 0xdeadbeef -- | A prime larger than the maximum hash table size prime :: Int32 @@ -246,7 +278,7 @@ hYSTERESIS = 64 -- entries to ignore in load computation -- new :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys - -> (key -> Int32) -- ^ @hash@: A hash function on keys + -> (key -> Int32) -- ^ @hash@: A hash function on keys -> IO (HashTable key val) -- ^ Returns: an empty hash table new cmpr hash = do