X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=Data%2FHashTable.hs;h=e96160a59c07bd18b39c80bc271d7106d7d91b1b;hb=HEAD;hp=391876f2e08dd898d7dd7b7d6f5281102ccf22f3;hpb=0c685143cbdfcfc3964cf945f8df1b2f3cb0cef8;p=ghc-base.git diff --git a/Data/HashTable.hs b/Data/HashTable.hs index 391876f..e96160a 100644 --- a/Data/HashTable.hs +++ b/Data/HashTable.hs @@ -1,4 +1,5 @@ -{-# OPTIONS_GHC -fno-implicit-prelude #-} +{-# LANGUAGE CPP, NoImplicitPrelude #-} +{-# OPTIONS_GHC -funbox-strict-fields -fno-warn-name-shadowing #-} ----------------------------------------------------------------------------- -- | @@ -18,16 +19,16 @@ ----------------------------------------------------------------------------- module Data.HashTable ( - -- * Basic hash table operations - HashTable, new, insert, delete, lookup, update, - -- * Converting to and from lists - fromList, toList, - -- * Hash functions - -- $hash_functions - hashInt, hashString, - prime, - -- * Diagnostics - longestChain + -- * Basic hash table operations + HashTable, new, newHint, insert, delete, lookup, update, + -- * Converting to and from lists + fromList, toList, + -- * Hash functions + -- $hash_functions + hashInt, hashString, + prime, + -- * Diagnostics + longestChain ) where -- This module is imported by Data.Dynamic, which is pretty low down in the @@ -36,36 +37,36 @@ module Data.HashTable ( #ifdef __GLASGOW_HASKELL__ import GHC.Base #else -import Prelude hiding ( lookup ) +import Prelude hiding ( lookup ) #endif -import Data.Tuple ( fst ) +import Data.Tuple ( fst ) import Data.Bits import Data.Maybe -import Data.List ( maximumBy, length, concat, foldl', partition ) -import Data.Int ( Int32 ) +import Data.List ( maximumBy, length, concat, foldl', partition ) +import Data.Int ( Int32 ) #if defined(__GLASGOW_HASKELL__) import GHC.Num -import GHC.Real ( fromIntegral ) -import GHC.Show ( Show(..) ) -import GHC.Int ( Int64 ) +import GHC.Real ( fromIntegral ) +import GHC.Show ( Show(..) ) +import GHC.Int ( Int64 ) -import GHC.IOBase ( IO, IOArray, newIOArray, - unsafeReadIOArray, unsafeWriteIOArray, unsafePerformIO, - IORef, newIORef, readIORef, writeIORef ) +import GHC.IO +import GHC.IOArray +import GHC.IORef #else -import Data.Char ( ord ) -import Data.IORef ( IORef, newIORef, readIORef, writeIORef ) -import System.IO.Unsafe ( unsafePerformIO ) -import Data.Int ( Int64 ) +import Data.Char ( ord ) +import Data.IORef ( IORef, newIORef, readIORef, writeIORef ) +import System.IO.Unsafe ( unsafePerformIO ) +import Data.Int ( Int64 ) # if defined(__HUGS__) -import Hugs.IOArray ( IOArray, newIOArray, - unsafeReadIOArray, unsafeWriteIOArray ) +import Hugs.IOArray ( IOArray, newIOArray, + unsafeReadIOArray, unsafeWriteIOArray ) # elif defined(__NHC__) -import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray ) +import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray ) # endif #endif -import Control.Monad ( mapM, mapM_, sequence_ ) +import Control.Monad ( mapM, mapM_, sequence_ ) ----------------------------------------------------------------------- @@ -77,41 +78,30 @@ iNSTRUMENTED = False readHTArray :: HTArray a -> Int32 -> IO a writeMutArray :: MutArray a -> Int32 -> a -> IO () -freezeArray :: MutArray a -> IO (HTArray a) -thawArray :: HTArray a -> IO (MutArray a) newMutArray :: (Int32, Int32) -> a -> IO (MutArray a) -#if defined(DEBUG) || defined(__NHC__) +newMutArray = newIOArray type MutArray a = IOArray Int32 a type HTArray a = MutArray a -newMutArray = newIOArray +#if defined(DEBUG) || defined(__NHC__) readHTArray = readIOArray writeMutArray = writeIOArray -freezeArray = return -thawArray = return #else -type MutArray a = IOArray Int32 a -type HTArray a = MutArray a -- Array Int32 a -newMutArray = newIOArray -readHTArray arr i = readMutArray arr i -- return $! (unsafeAt arr (fromIntegral i)) -readMutArray :: MutArray a -> Int32 -> IO a -readMutArray arr i = unsafeReadIOArray arr (fromIntegral i) +readHTArray arr i = unsafeReadIOArray arr (fromIntegral i) writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x -freezeArray = return -- unsafeFreeze -thawArray = return -- unsafeThaw #endif data HashTable key val = HashTable { - cmp :: !(key -> key -> Bool), - hash_fn :: !(key -> Int32), + cmp :: !(key -> key -> Bool), + hash_fn :: !(key -> Int32), tab :: !(IORef (HT key val)) } -- TODO: the IORef should really be an MVar. data HT key val = HT { - kcount :: !Int32, -- Total number of keys. + kcount :: !Int32, -- Total number of keys. bmask :: !Int32, - buckets :: !(HTArray [(key,val)]) + buckets :: !(HTArray [(key,val)]) } -- ------------------------------------------------------------ @@ -170,7 +160,7 @@ recordLookup = instrument lkup -- stats :: IO String -- stats = fmap show $ readIORef hashData --- ----------------------------------------------------------------------------- +-- ---------------------------------------------------------------------------- -- Sample hash functions -- $hash_functions @@ -180,41 +170,73 @@ recordLookup = instrument lkup -- function therefore will give an even distribution regardless of /n/. -- -- If your keyspace is integrals such that the low-order bits between --- keys are highly variable, then you could get away with using 'id' +-- keys are highly variable, then you could get away with using 'fromIntegral' -- as the hash function. -- -- We provide some sample hash functions for 'Int' and 'String' below. golden :: Int32 -golden = -1640531527 +golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32 +-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32 +-- but that has bad mulHi properties (even adding 2^32 to get its inverse) +-- Whereas the above works well and contains no hash duplications for +-- [-32767..65536] + +hashInt32 :: Int32 -> Int32 +hashInt32 x = mulHi x golden + x -- | A sample (and useful) hash function for Int and Int32, --- implemented by extracting the lowermost 32 bits of the --- result of multiplying by a 32-bit constant. The constant is from +-- implemented by extracting the uppermost 32 bits of the 64-bit +-- result of multiplying by a 33-bit constant. The constant is from -- Knuth, derived from the golden ratio: -- --- > golden = round ((sqrt 5 - 1) * 2^31) :: Int +-- > golden = round ((sqrt 5 - 1) * 2^32) +-- +-- We get good key uniqueness on small inputs +-- (a problem with previous versions): +-- (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768 +-- hashInt :: Int -> Int32 -hashInt x = fromIntegral x * golden +hashInt x = hashInt32 (fromIntegral x) -- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply mulHi :: Int32 -> Int32 -> Int32 mulHi a b = fromIntegral (r `shiftR` 32) - where r :: Int64 - r = fromIntegral a * fromIntegral b :: Int64 + where r :: Int64 + r = fromIntegral a * fromIntegral b -- | A sample hash function for Strings. We keep multiplying by the -- golden ratio and adding. The implementation is: -- --- > hashString = foldl' f 0 --- > where f m c = fromIntegral (fromEnum c + 1) * golden + mulHi m golden +-- > hashString = foldl' f golden +-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m +-- > magic = 0xdeadbeef +-- +-- Where hashInt32 works just as hashInt shown above. +-- +-- Knuth argues that repeated multiplication by the golden ratio +-- will minimize gaps in the hash space, and thus it's a good choice +-- for combining together multiple keys to form one. -- --- Note that this has not been extensively tested for reasonability, --- but Knuth argues that repeated multiplication by the golden ratio --- will minimize gaps in the hash space. +-- Here we know that individual characters c are often small, and this +-- produces frequent collisions if we use ord c alone. A +-- particular problem are the shorter low ASCII and ISO-8859-1 +-- character strings. We pre-multiply by a magic twiddle factor to +-- obtain a good distribution. In fact, given the following test: +-- +-- > testp :: Int32 -> Int +-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls +-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']] +-- > hs = foldl' f golden +-- > f m c = fromIntegral (ord c) * k + hashInt32 m +-- > n = 100000 +-- +-- We discover that testp magic = 0. + hashString :: String -> Int32 -hashString = foldl' f 0 - where f m c = fromIntegral (ord c + 1) * golden + mulHi m golden +hashString = foldl' f golden + where f m c = fromIntegral (ord c) * magic + hashInt32 m + magic = 0xdeadbeef -- | A prime larger than the maximum hash table size prime :: Int32 @@ -246,15 +268,63 @@ hYSTERESIS = 64 -- entries to ignore in load computation -- new :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys - -> (key -> Int32) -- ^ @hash@: A hash function on keys + -> (key -> Int32) -- ^ @hash@: A hash function on keys -> IO (HashTable key val) -- ^ Returns: an empty hash table new cmpr hash = do recordNew -- make a new hash table with a single, empty, segment let mask = tABLE_MIN-1 - bkts' <- newMutArray (0,mask) [] - bkts <- freezeArray bkts' + bkts <- newMutArray (0,mask) [] + + let + kcnt = 0 + ht = HT { buckets=bkts, kcount=kcnt, bmask=mask } + + table <- newIORef ht + return (HashTable { tab=table, hash_fn=hash, cmp=cmpr }) + +{- + bitTwiddleSameAs takes as arguments positive Int32s less than maxBound/2 and + returns the smallest power of 2 that is greater than or equal to the + argument. + http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 +-} +bitTwiddleSameAs :: Int32 -> Int32 +bitTwiddleSameAs v0 = + let v1 = v0-1 + v2 = v1 .|. (v1`shiftR`1) + v3 = v2 .|. (v2`shiftR`2) + v4 = v3 .|. (v3`shiftR`4) + v5 = v4 .|. (v4`shiftR`8) + v6 = v5 .|. (v5`shiftR`16) + in v6+1 + +{- + powerOver takes as arguments Int32s and returns the smallest power of 2 + that is greater than or equal to the argument if that power of 2 is + within [tABLE_MIN,tABLE_MAX] +-} +powerOver :: Int32 -> Int32 +powerOver n = + if n <= tABLE_MIN + then tABLE_MIN + else if n >= tABLE_MAX + then tABLE_MAX + else bitTwiddleSameAs n + +-- | Creates a new hash table with the given minimum size. +newHint + :: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys + -> (key -> Int32) -- ^ @hash@: A hash function on keys + -> Int -- ^ @minSize@: initial table size + -> IO (HashTable key val) -- ^ Returns: an empty hash table + +newHint cmpr hash minSize = do + recordNew + -- make a new hash table with a single, empty, segment + let mask = powerOver $ fromIntegral minSize + bkts <- newMutArray (0,mask) [] let kcnt = 0 @@ -337,9 +407,7 @@ updatingBucket canEnlarge bucketFn (bckt', inserts, result) <- return $ bucketFn bckt let k' = k + inserts table1 = table { kcount=k' } - bkts' <- thawArray bkts - writeMutArray bkts' indx bckt' - freezeArray bkts' + writeMutArray bkts indx bckt' table2 <- if canEnlarge == CanInsert && inserts > 0 then do recordIns inserts k' bckt' if tooBig k' b @@ -360,19 +428,17 @@ expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do then return table else do -- - newbkts' <- newMutArray (0,newmask) [] + newbkts <- newMutArray (0,newmask) [] let splitBucket oldindex = do bucket <- readHTArray bkts oldindex let (oldb,newb) = partition ((oldindex==). bucketIndex newmask . hash . fst) bucket - writeMutArray newbkts' oldindex oldb - writeMutArray newbkts' (oldindex + oldsize) newb + writeMutArray newbkts oldindex oldb + writeMutArray newbkts (oldindex + oldsize) newb mapM_ splitBucket [0..mask] - newbkts <- freezeArray newbkts' - return ( table{ buckets=newbkts, bmask=newmask } ) -- -----------------------------------------------------------------------------