Sync with FPS head

author dons@cse.unsw.edu.au <unknown>

Wed, 3 May 2006 10:52:59 +0000 (10:52 +0000)

committer dons@cse.unsw.edu.au <unknown>

Wed, 3 May 2006 10:52:59 +0000 (10:52 +0000)
author dons@cse.unsw.edu.au <unknown>
Wed, 3 May 2006 10:52:59 +0000 (10:52 +0000)
committer dons@cse.unsw.edu.au <unknown>
Wed, 3 May 2006 10:52:59 +0000 (10:52 +0000)
diff --git a/Data/ByteString.hs b/Data/ByteString.hs

index 6eb861a..76e84d5 100644 (file)
--- a/Data/ByteString.hs
+++ b/Data/ByteString.hs
@@ -1,4 +1,4 @@
-{-# OPTIONS_GHC -cpp -fffi #-}
+{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
  --
  -- Module      : ByteString
  -- Copyright   : (c) The University of Glasgow 2001,
@@ -99,6 +99,7 @@ module Data.ByteString (
  
          -- ** Breaking and dropping on specific bytes
          breakByte,              -- :: Word8 -> ByteString -> (ByteString, ByteString)
+        spanByte,               -- :: Word8 -> ByteString -> (ByteString, ByteString)
          breakFirst,             -- :: Word8 -> ByteString -> Maybe (ByteString,ByteString)
          breakLast,              -- :: Word8 -> ByteString -> Maybe (ByteString,ByteString)
  
@@ -106,6 +107,8 @@ module Data.ByteString (
          split,                  -- :: Word8 -> ByteString -> [ByteString]
          splitWith,              -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
          tokens,                 -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
+        group,                  -- :: ByteString -> [ByteString]
+        groupBy,                -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
  
          -- ** Joining strings
          join,                   -- :: ByteString -> [ByteString] -> ByteString
@@ -240,6 +243,7 @@ import qualified Data.Array as Array ((!))
  
  -- Control.Exception.bracket not available in yhc or nhc
  import Control.Exception        (bracket)
+import Control.Monad            (when)
  
  import Foreign.C.String         (CString, CStringLen)
  import Foreign.C.Types          (CSize, CInt)
@@ -647,7 +651,7 @@ foldl1 f ps
  foldr1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
  foldr1 f ps
      | null ps        = errorEmptyList "foldr1"
-    | otherwise      = f (unsafeHead ps) (foldr1 f (unsafeTail ps))
+    | otherwise      = foldr f (last ps) (init ps)
  
  -- ---------------------------------------------------------------------
  -- Special folds
@@ -873,6 +877,24 @@ breakByte c p = case elemIndex c p of
      Just n  -> (take n p, drop n p)
  {-# INLINE breakByte #-}
  
+-- | 'spanByte' breaks its ByteString argument at the first
+-- occurence of a byte other than its argument. It is more efficient
+-- than 'span (==)'
+--
+-- > span  (=='c') "abcd" == spanByte 'c' "abcd"
+--
+spanByte :: Word8 -> ByteString -> (ByteString, ByteString)
+spanByte c ps@(PS x s l) = inlinePerformIO $ withForeignPtr x $ \p ->
+    go (p `plusPtr` s) 0
+  where
+    STRICT2(go)
+    go p i | i >= l    = return (ps, empty)
+           | otherwise = do c' <- peekByteOff p i
+                            if c /= c'
+                                then return (take i ps, drop i ps)
+                                else go p (i+1)
+{-# INLINE spanByte #-}
+
  -- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
  -- occurence of @w@. It behaves like 'break', except the delimiter is
  -- not returned, and @Nothing@ is returned if the delimiter is not in
@@ -910,7 +932,7 @@ breakLast c p = case elemIndexLast c p of
  -- | 'span' @p xs@ breaks the ByteString into two segments. It is
  -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
  span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
-span  p ps = break (not . p) ps
+span p ps = break (not . p) ps
  {-# INLINE span #-}
  
  -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
@@ -1037,6 +1059,31 @@ split (W8# w#) (PS fp off len) = splitWith' off len fp
  tokens :: (Word8 -> Bool) -> ByteString -> [ByteString]
  tokens f = P.filter (not.null) . splitWith f
  
+-- | The 'group' function takes a ByteString and returns a list of
+-- ByteStrings such that the concatenation of the result is equal to the
+-- argument.  Moreover, each sublist in the result contains only equal
+-- elements.  For example,
+--
+-- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
+--
+-- It is a special case of 'groupBy', which allows the programmer to
+-- supply their own equality test. It is about 40% faster than 
+-- /groupBy (==)/
+group :: ByteString -> [ByteString]
+group xs
+    | null xs   = []
+    | otherwise = ys : group zs
+    where
+        (ys, zs) = spanByte (unsafeHead xs) xs
+
+-- | The 'groupBy' function is the non-overloaded version of 'group'.
+groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+groupBy k xs
+    | null xs   = []
+    | otherwise = take n xs : groupBy k (drop n xs)
+    where
+        n = 1 + findIndexOrEnd (not . k (unsafeHead xs)) (unsafeTail xs)
+
  -- | /O(n)/ The 'join' function takes a 'ByteString' and a list of
  -- 'ByteString's and concatenates the list after interspersing the first
  -- argument between each element of the list.
@@ -1390,16 +1437,52 @@ elems (PS x s l) = (PS x s 1:elems (PS x (s+1) (l-1)))
  -- ---------------------------------------------------------------------
  -- ** Ordered 'ByteString's
  
--- | /O(n log(n))/ Sort a ByteString efficiently, using qsort(3).
+-- | /O(n)/ Sort a ByteString efficiently, using counting sort.
+sort :: ByteString -> ByteString
+sort (PS input s l) = create l $ \p -> allocaArray 256 $ \arr -> do
+
+    memset (castPtr arr) 0 (256 * fromIntegral (sizeOf (undefined :: CSize)))
+    withForeignPtr input (\x -> countEach arr (x `plusPtr` s) l)
+
+    let STRICT2(go)
+        go 256 _   = return ()
+        go i   ptr = do n <- peekElemOff arr i
+                        when (n /= 0) $ memset ptr (fromIntegral i) n >> return ()
+                        go (i + 1) (ptr `plusPtr` (fromIntegral n))
+    go 0 p
+
+-- "countEach counts str l" counts the number of occurences of each Word8 in
+-- str, and stores the result in counts.
+countEach :: Ptr CSize -> Ptr Word8 -> Int -> IO ()
+STRICT3(countEach)
+countEach counts str l = go 0
+ where
+    STRICT1(go)
+    go i | i == l    = return ()
+         | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
+                          x <- peekElemOff counts k
+                          pokeElemOff counts k (x + 1)
+                          go (i + 1)
+
+{-
  sort :: ByteString -> ByteString
  sort (PS x s l) = create l $ \p -> withForeignPtr x $ \f -> do
          memcpy p (f `plusPtr` s) l
          c_qsort p l -- inplace
+-}
  
  {-
  sort = pack . List.sort . unpack
  -}
  
+-- | The 'sortBy' function is the non-overloaded version of 'sort'.
+--
+-- Try some linear sorts: radix, counting
+-- Or mergesort.
+--
+-- sortBy :: (Word8 -> Word8 -> Ordering) -> ByteString -> ByteString
+-- sortBy f ps = undefined
+
  -- ---------------------------------------------------------------------
  --
  -- Extensions to the basic interface
@@ -1863,8 +1946,9 @@ mallocByteString l = do
  
  -- | A way of creating ForeignPtrs outside the IO monad. The @Int@
  -- argument gives the final size of the ByteString. Unlike 'generate'
--- the ByteString is no reallocated if the final size is less than the
--- estimated size.
+-- the ByteString is not reallocated if the final size is less than the
+-- estimated size. Also, unlike 'generate' ByteString's created this way
+-- are managed on the Haskell heap.
  create :: Int -> (Ptr Word8 -> IO ()) -> ByteString
  create l write_ptr = inlinePerformIO $ do
      fp <- mallocByteString (l+1)
@@ -1971,9 +2055,6 @@ foreign import ccall unsafe "static fpstring.h minimum" c_minimum
  foreign import ccall unsafe "static fpstring.h count" c_count
      :: Ptr Word8 -> Int -> Word8 -> Int
  
-foreign import ccall unsafe "static fpstring.h my_qsort" c_qsort
-    :: Ptr Word8 -> Int -> IO ()
-
  -- ---------------------------------------------------------------------
  -- MMap
  
diff --git a/Data/ByteString/Char8.hs b/Data/ByteString/Char8.hs

index dd94d0a..530dda8 100644 (file)
--- a/Data/ByteString/Char8.hs
+++ b/Data/ByteString/Char8.hs
@@ -1,4 +1,4 @@
-{-# OPTIONS_GHC -cpp -fffi #-}
+{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
  --
  -- Module      : Data.ByteString.Char8
  -- Copyright   : (c) Don Stewart 2006
@@ -98,6 +98,7 @@ module Data.ByteString.Char8 (
  
          -- ** Breaking and dropping on specific Chars
          breakChar,              -- :: Char -> ByteString -> (ByteString, ByteString)
+        spanChar,           -- :: Char -> ByteString -> (ByteString, ByteString)
          breakFirst,             -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
          breakLast,              -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
          breakSpace,             -- :: ByteString -> Maybe (ByteString,ByteString)
@@ -108,6 +109,8 @@ module Data.ByteString.Char8 (
          split,                  -- :: Char -> ByteString -> [ByteString]
          splitWith,              -- :: (Char -> Bool) -> ByteString -> [ByteString]
          tokens,                 -- :: (Char -> Bool) -> ByteString -> [ByteString]
+        group,                  -- :: ByteString -> [ByteString]
+        groupBy,                -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
  
          -- ** Breaking into lines and words
          lines,                  -- :: ByteString -> [ByteString]
@@ -232,7 +235,7 @@ import Data.ByteString (ByteString(..)
                         ,inits,tails,elems,reverse,transpose
                         ,concat,take,drop,splitAt,join
                         ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
-                       ,findSubstrings,unsafeTail,copy
+                       ,findSubstrings,unsafeTail,copy,group
  
                         ,getContents, putStr, putStrLn
                         ,readFile, {-mmapFile,-} writeFile
@@ -365,7 +368,7 @@ foldl1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
  -- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
  -- and thus must be applied to non-empty 'ByteString's
  foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
-foldr1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
+foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
  {-# INLINE foldr1 #-}
  
  -- | Map a function over a 'ByteString' and concatenate the results
@@ -486,6 +489,16 @@ breakChar :: Char -> ByteString -> (ByteString, ByteString)
  breakChar = B.breakByte . c2w
  {-# INLINE breakChar #-}
  
+-- | 'spanChar' breaks its ByteString argument at the first
+-- occurence of a Char other than its argument. It is more efficient
+-- than 'span (==)'
+--
+-- > span  (=='c') "abcd" == spanByte 'c' "abcd"
+--
+spanChar :: Char -> ByteString -> (ByteString, ByteString)
+spanChar = B.spanByte . c2w
+{-# INLINE spanChar #-}
+
  -- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
  -- occurence of @w@. It behaves like 'break', except the delimiter is
  -- not returned, and @Nothing@ is returned if the delimiter is not in
@@ -557,6 +570,10 @@ tokens :: (Char -> Bool) -> ByteString -> [ByteString]
  tokens f = B.tokens (f . w2c)
  {-# INLINE tokens #-}
  
+-- | The 'groupBy' function is the non-overloaded version of 'group'.
+groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
+groupBy k = B.groupBy (\a b -> k (w2c a) (w2c b))
+
  -- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
  -- char. Around 4 times faster than the generalised join.
  --
@@ -608,6 +625,10 @@ findIndices f = B.findIndices (f . w2c)
  -- | count returns the number of times its argument appears in the ByteString
  --
  -- > count = length . elemIndices
+-- 
+-- Also
+--  
+-- > count '\n' == length . lines
  --
  -- But more efficiently than using length on the intermediate list.
  count :: Char -> ByteString -> Int
@@ -785,6 +806,7 @@ lastnonspace ptr n
  
  -- | 'lines' breaks a ByteString up into a list of ByteStrings at
  -- newline Chars. The resulting strings do not contain newlines.
+--
  lines :: ByteString -> [ByteString]
  lines ps
      | null ps = []
diff --git a/cbits/fpstring.c b/cbits/fpstring.c

index 3fabcc8..b8fc540 100644 (file)
--- a/cbits/fpstring.c
+++ b/cbits/fpstring.c
@@ -41,16 +41,6 @@ void reverse(unsigned char *dest, unsigned char *from, int len) {
          *q++ = *p--;
  }
  
-/* compare bytes ascii-wise */
-static int cmp(const void *p, const void *q) {
-    return (*(unsigned char *)p - *(unsigned char *)q);
-}
-
-/* quicksort wrapper */
-void my_qsort(unsigned char *base, size_t size) {
-    qsort(base, size, sizeof(char), cmp);
-}
-
  /* duplicate a string, interspersing the character through the elements
     of the duplicated string */
  void intersperse(unsigned char *dest, unsigned char *from, int len, char c) {
diff --git a/include/fpstring.h b/include/fpstring.h

index 18e633f..614162d 100644 (file)
--- a/include/fpstring.h
+++ b/include/fpstring.h
@@ -1,7 +1,5 @@
-#include <stdlib.h>
  
  void reverse(unsigned char *dest, unsigned char *from, int len);
-void my_qsort(unsigned char *base, size_t size);
  void intersperse(unsigned char *dest, unsigned char *from, int len, char c);
  unsigned char maximum(unsigned char *p, int len);
  unsigned char minimum(unsigned char *p, int len);
author	dons@cse.unsw.edu.au <unknown>
	Wed, 3 May 2006 10:52:59 +0000 (10:52 +0000)
committer	dons@cse.unsw.edu.au <unknown>
	Wed, 3 May 2006 10:52:59 +0000 (10:52 +0000)
Data/ByteString.hs		patch \| blob \| history
Data/ByteString/Char8.hs		patch \| blob \| history
cbits/fpstring.c		patch \| blob \| history
include/fpstring.h		patch \| blob \| history