[project @ 2005-01-28 10:15:44 by ross]

[ghc-base.git] / Data / HashTable.hs
diff --git a/Data/HashTable.hs b/Data/HashTable.hs

index 917daab..0f31af5 100644 (file)
--- a/Data/HashTable.hs
+++ b/Data/HashTable.hs
@@ -1,3 +1,5 @@
+{-# OPTIONS_GHC -fno-implicit-prelude #-}
+
  -----------------------------------------------------------------------------
  -- |
  -- Module      :  Data.HashTable
@@ -17,35 +19,61 @@
  
  module Data.HashTable (
         -- * Basic hash table operations
-       HashTable, new, insert, delete, lookup,
+       HashTable, new, insert, delete, lookup, update,
         -- * Converting to and from lists
         fromList, toList,
         -- * Hash functions
         -- $hash_functions
         hashInt, hashString,
+       prime,
         -- * Diagnostics
         longestChain
   ) where
  
-import Data.Char       ( ord )
-import Data.Int        ( Int32 )
-import Data.Array.IO
-import Data.Array.Base
-import Data.List       ( maximumBy )
-import Data.IORef
+-- This module is imported by Data.Dynamic, which is pretty low down in the
+-- module hierarchy, so don't import "high-level" modules
+
+#ifdef __GLASGOW_HASKELL__
+import GHC.Base
+#else
+import Prelude hiding  ( lookup )
+#endif
+import Data.Tuple      ( fst )
  import Data.Bits
-import Control.Monad   ( when )
-import Prelude                 hiding (lookup)
---import Debug.Trace
+import Data.Maybe
+import Data.List       ( maximumBy, filter, length, concat, foldl )
+import Data.Int                ( Int32 )
+
+#if defined(__GLASGOW_HASKELL__)
+import GHC.Num
+import GHC.Real                ( Integral(..), fromIntegral )
+
+import GHC.IOBase      ( IO, IOArray, newIOArray, readIOArray, writeIOArray,
+                         unsafeReadIOArray, unsafeWriteIOArray,
+                         IORef, newIORef, readIORef, writeIORef )
+import GHC.Err         ( undefined )
+#else
+import Data.Char       ( ord )
+import Data.IORef      ( IORef, newIORef, readIORef, writeIORef )
+#  if defined(__HUGS__)
+import Hugs.IOArray    ( IOArray, newIOArray, readIOArray, writeIOArray,
+                         unsafeReadIOArray, unsafeWriteIOArray )
+#  elif defined(__NHC__)
+import NHC.IOExtras    ( IOArray, newIOArray, readIOArray, writeIOArray)
+#  endif
+#endif
+import Control.Monad   ( when, mapM, sequence_ )
  
+
+-----------------------------------------------------------------------
  myReadArray  :: IOArray Int32 a -> Int32 -> IO a
  myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
-#ifdef DEBUG
-myReadArray  = readArray
-myWriteArray = writeArray
+#if defined(DEBUG) || defined(__NHC__)
+myReadArray  = readIOArray
+myWriteArray = writeIOArray
  #else
-myReadArray arr i = unsafeRead arr (fromIntegral i)
-myWriteArray arr i x = unsafeWrite arr (fromIntegral i) x
+myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)
+myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
  #endif
  
  -- | A hash table mapping keys of type @key@ to values of type @val@.
@@ -125,11 +153,12 @@ hashInt = (`rem` prime) . fromIntegral
  -- which seems to give reasonable results.
  --
  hashString :: String -> Int32
-hashString = fromIntegral . foldr f 0
-  where f c m = ord c + (m * 128) `rem` fromIntegral prime
+hashString = fromIntegral . foldl f 0
+  where f m c = ord c + (m * 128) `rem` fromIntegral prime
  
--- a prime larger than the maximum hash table size
-prime = 1500007 :: Int32
+-- | A prime larger than the maximum hash table size
+prime :: Int32
+prime = 1500007
  
  -- -----------------------------------------------------------------------------
  -- Parameters
@@ -146,16 +175,20 @@ hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
  -- -----------------------------------------------------------------------------
  -- Creating a new hash table
  
--- | Creates a new hash table
+-- | Creates a new hash table.  The following property should hold for the @eq@
+-- and @hash@ functions passed to 'new':
+--
+-- >   eq A B  =>  hash A == hash B
+--
  new
-  :: (key -> key -> Bool)    -- ^ An equality comparison on keys
-  -> (key -> Int32)         -- ^ A hash function on keys
+  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
+  -> (key -> Int32)         -- ^ @hash@: A hash function on keys
    -> IO (HashTable key val)  -- ^ Returns: an empty hash table
  
  new cmp hash_fn = do
    -- make a new hash table with a single, empty, segment
-  dir     <- newArray (0,dIR_SIZE) undefined
-  segment <- newArray (0,sEGMENT_SIZE-1) []
+  dir     <- newIOArray (0,dIR_SIZE-1) undefined
+  segment <- newIOArray (0,sEGMENT_SIZE-1) []
    myWriteArray dir 0 segment
  
    let
@@ -176,7 +209,14 @@ new cmp hash_fn = do
  -- -----------------------------------------------------------------------------
  -- Inserting a key\/value pair into the hash table
  
--- | Inserts an key\/value mapping into the hash table.
+-- | Inserts an key\/value mapping into the hash table.  
+--
+-- Note that 'insert' doesn't remove the old entry from the table -
+-- the behaviour is like an association list, where 'lookup' returns
+-- the most-recently-inserted mapping for a key in the table.  The
+-- reason for this is to keep 'insert' as efficient as possible.  If
+-- you need to update a mapping, then we provide 'update'.
+--
  insert :: HashTable key val -> key -> val -> IO ()
  
  insert (HashTable ref) key val = do
@@ -187,7 +227,7 @@ insert (HashTable ref) key val = do
            then expandHashTable table1
            else return table1
    writeIORef ref table2
-  (segment_index,segment_offset) <- tableLocation table key
+  (segment_index,segment_offset) <- tableLocation table2 key
    segment <- myReadArray dir segment_index
    bucket <- myReadArray segment segment_offset
    myWriteArray segment segment_offset ((key,val):bucket)
@@ -221,6 +261,7 @@ expandHashTable
        table@HT{ dir=dir,
                 split=split,
                 max_bucket=max,
+               bcount=bcount,
                 mask2=mask2 } = do
    let
        oldsegment = split `shiftR` sEGMENT_SHIFT
@@ -230,20 +271,28 @@ expandHashTable
        newsegment = newbucket `shiftR` sEGMENT_SHIFT
        newindex   = newbucket .&. sEGMENT_MASK
    --
-  when (newindex == 0) $
-       do segment <- newArray (0,sEGMENT_SIZE-1) []
-          myWriteArray dir newsegment segment
+  if newsegment >= dIR_SIZE    -- make sure we don't overflow the table.
+       then return table
+       else do
+  --
+   when (newindex == 0) $
+       do segment <- newIOArray (0,sEGMENT_SIZE-1) []
+          writeIOArray dir newsegment segment
+          -- doesn't happen very often, so we might as well use a safe
+          -- array index here.
    --
-  let table' =
+   let table' =
         if (split+1) < max
-           then table{ split = split+1 }
+           then table{ split = split+1,
+                       bcount = bcount+1 }
                 -- we've expanded all the buckets in this table, so start from
                 -- the beginning again.
             else table{ split = 0,
+                       bcount = bcount+1,
                         max_bucket = max * 2,
                         mask1 = mask2,
                         mask2 = mask2 `shiftL` 1 .|. 1 }
-  let
+   let
      split_bucket old new [] = do
         segment <- myReadArray dir oldsegment
         myWriteArray segment oldindex old
@@ -255,10 +304,10 @@ expandHashTable
                 then split_bucket old ((k,v):new) xs
                 else split_bucket ((k,v):old) new xs
    --
-  segment <- myReadArray dir oldsegment
-  bucket <- myReadArray segment oldindex
-  split_bucket [] [] bucket
-  return table'
+   segment <- myReadArray dir oldsegment
+   bucket <- myReadArray segment oldindex
+   split_bucket [] [] bucket
+   return table'
  
  -- -----------------------------------------------------------------------------
  -- Deleting a mapping from the hash table
@@ -275,6 +324,43 @@ delete (HashTable ref) key = do
    return ()
  
  -- -----------------------------------------------------------------------------
+-- Deleting a mapping from the hash table
+
+-- | Updates an entry in the hash table, returning 'True' if there was
+-- already an entry for this key, or 'False' otherwise.  After 'update'
+-- there will always be exactly one entry for the given key in the table.
+--
+-- 'insert' is more efficient than 'update' if you don't care about
+-- multiple entries, or you know for sure that multiple entries can't
+-- occur.  However, 'update' is more efficient than 'delete' followed
+-- by 'insert'.
+update :: HashTable key val -> key -> val -> IO Bool
+
+update (HashTable ref) key val = do
+  table@HT{ kcount=k, bcount=b, dir=dir, cmp=cmp } <- readIORef ref
+  let table1 = table{ kcount = k+1 }
+  -- optimistically expand the table
+  table2 <-
+       if (k > hLOAD * b)
+          then expandHashTable table1
+          else return table1
+  writeIORef ref table2
+  (segment_index,segment_offset) <- tableLocation table2 key
+  segment <- myReadArray dir segment_index
+  bucket <- myReadArray segment segment_offset
+  let 
+    (deleted,bucket') = foldr filt (0::Int32,[]) bucket
+    filt pair@(k,v) (deleted,bucket)
+       | key `cmp` k = (deleted+1, bucket)
+       | otherwise   = (deleted,   pair:bucket)
+  -- in  
+  myWriteArray segment segment_offset ((key,val):bucket')
+  -- update the table load, taking into account the number of
+  -- items we just deleted.
+  writeIORef ref table2{ kcount = kcount table2 - deleted }
+  return (deleted /= 0)
+
+-- -----------------------------------------------------------------------------
  -- Looking up an entry in the hash table
  
  -- | Looks up the value of a key in the hash table.