Faster filterF, filterNotByte

[haskell-directory.git] / Data / ByteString.hs
diff --git a/Data/ByteString.hs b/Data/ByteString.hs

index 7350eb8..3c15b84 100644 (file)
--- a/Data/ByteString.hs
+++ b/Data/ByteString.hs
@@ -1,4 +1,4 @@
-{-# OPTIONS_GHC -cpp -fffi #-}
+{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
  --
  -- Module      : ByteString
  -- Copyright   : (c) The University of Glasgow 2001,
@@ -6,6 +6,11 @@
  --               (c) Simon Marlow 2005
  --               (c) Don Stewart 2005-2006
  --               (c) Bjorn Bringert 2006
+--
+-- Array fusion code:
+--               (c) 2001,2002 Manuel M T Chakravarty & Gabriele Keller
+--               (c) 2006      Manuel M T Chakravarty & Roman Leshchinskiy
+--
  -- License     : BSD-style
  --
  -- Maintainer  : dons@cse.unsw.edu.au
@@ -18,8 +23,8 @@
  -- | A time and space-efficient implementation of byte vectors using
  -- packed Word8 arrays, suitable for high performance use, both in terms
  -- of large data quantities, or high speed requirements. Byte vectors
--- are encoded as Word8 arrays of bytes, held in a ForeignPtr, and can
--- be passed between C and Haskell with little effort.
+-- are encoded as strict Word8 arrays of bytes, held in a ForeignPtr,
+-- and can be passed between C and Haskell with little effort.
  --
  -- This module is intended to be imported @qualified@, to avoid name
  -- clashes with Prelude functions.  eg.
@@ -80,7 +85,6 @@ module Data.ByteString (
          maximum,                -- :: ByteString -> Word8
          minimum,                -- :: ByteString -> Word8
          mapIndexed,             -- :: (Int -> Word8 -> Word8) -> ByteString -> ByteString
-        hash,                   -- :: ByteString -> Int32
  
          -- * Generating and unfolding ByteStrings
          replicate,              -- :: Int -> Word8 -> ByteString
@@ -100,6 +104,7 @@ module Data.ByteString (
  
          -- ** Breaking and dropping on specific bytes
          breakByte,              -- :: Word8 -> ByteString -> (ByteString, ByteString)
+        spanByte,               -- :: Word8 -> ByteString -> (ByteString, ByteString)
          breakFirst,             -- :: Word8 -> ByteString -> Maybe (ByteString,ByteString)
          breakLast,              -- :: Word8 -> ByteString -> Maybe (ByteString,ByteString)
  
@@ -107,6 +112,8 @@ module Data.ByteString (
          split,                  -- :: Word8 -> ByteString -> [ByteString]
          splitWith,              -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
          tokens,                 -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
+        group,                  -- :: ByteString -> [ByteString]
+        groupBy,                -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
  
          -- ** Joining strings
          join,                   -- :: ByteString -> [ByteString] -> ByteString
@@ -202,6 +209,7 @@ module Data.ByteString (
          -- ** Files
          readFile,               -- :: FilePath -> IO ByteString
          writeFile,              -- :: FilePath -> ByteString -> IO ()
+--      mmapFile,               -- :: FilePath -> IO ByteString
  
          -- ** I\/O with Handles
  #if defined(__GLASGOW_HASKELL__)
@@ -213,11 +221,15 @@ module Data.ByteString (
          hGet,                   -- :: Handle -> Int -> IO ByteString
          hPut,                   -- :: Handle -> ByteString -> IO ()
  
+        -- * Fusion utilities
  #if defined(__GLASGOW_HASKELL__)
-        -- * Miscellaneous
          unpackList, -- eek, otherwise it gets thrown away by the simplifier
  #endif
  
+        noAL, NoAL, loopArr, loopAcc, loopSndAcc,
+        loopU, mapEFL, filterEFL, foldEFL,
+        filterF, mapF
+
    ) where
  
  import qualified Prelude as P
@@ -234,50 +246,53 @@ import qualified Data.List as List
  
  import Data.Char
  import Data.Word                (Word8)
-import Data.Int                 (Int32)
-import Data.Bits                (rotateL)
  import Data.Maybe               (listToMaybe)
  import Data.Array               (listArray)
  import qualified Data.Array as Array ((!))
  
+-- Control.Exception.bracket not available in yhc or nhc
  import Control.Exception        (bracket)
+import Control.Monad            (when)
  
-import Foreign.C.Types          (CSize, CInt)
  import Foreign.C.String         (CString, CStringLen)
-import Foreign.Storable
+import Foreign.C.Types          (CSize, CInt)
  import Foreign.ForeignPtr
-import Foreign.Ptr
  import Foreign.Marshal.Array
+import Foreign.Ptr
+import Foreign.Storable         (Storable(..))
  
+-- hGetBuf and hPutBuf not available in yhc or nhc
  import System.IO                (stdin,stdout,hClose,hFileSize
                                  ,hGetBuf,hPutBuf,openBinaryFile
                                  ,Handle,IOMode(..))
  
-#if defined(__GLASGOW_HASKELL__)
-
-import System.IO                (hGetBufNonBlocking)
+#if !defined(__GLASGOW_HASKELL__)
+import System.IO.Unsafe
+#endif
  
-import qualified Foreign.Concurrent as FC (newForeignPtr)
+#if defined(__GLASGOW_HASKELL__)
  
  import Data.Generics            (Data(..), Typeable(..))
  
+import System.IO                (hGetBufNonBlocking)
  import System.IO.Error          (isEOFError)
+
  import Foreign.Marshal          (alloca)
+import qualified Foreign.Concurrent as FC (newForeignPtr)
  
  import GHC.Handle
-import GHC.Prim
+import GHC.Prim                 (realWorld#, Addr#, Word#, (+#), writeWord8OffAddr#)
  import GHC.Base                 (build, unsafeChr)
  import GHC.Word hiding (Word8)
  import GHC.Ptr                  (Ptr(..))
  import GHC.ST                   (ST(..))
  import GHC.IOBase
  
-#else
-
-import System.IO.Unsafe
-
  #endif
  
+-- CFILES stuff is Hugs only
+{-# CFILES cbits/fpstring.c #-}
+
  -- -----------------------------------------------------------------------------
  --
  -- Useful macros, until we have bang patterns
@@ -324,19 +339,24 @@ instance Arbitrary PackedString where
  
  -- | /O(n)/ Equality on the 'ByteString' type.
  eq :: ByteString -> ByteString -> Bool
-eq a b = (compareBytes a b) == EQ
+eq a@(PS p s l) b@(PS p' s' l')
+    | l /= l'            = False    -- short cut on length
+    | p == p' && s == s' = True     -- short cut for the same string
+    | otherwise          = compareBytes a b == EQ
  {-# INLINE eq #-}
  
  -- | /O(n)/ 'compareBytes' provides an 'Ordering' for 'ByteStrings' supporting slices. 
  compareBytes :: ByteString -> ByteString -> Ordering
-compareBytes (PS _ _ 0) (PS _ _ 0)       = EQ    -- short cut for empty strings
-compareBytes (PS x1 s1 l1) (PS x2 s2 l2) = inlinePerformIO $
-    withForeignPtr x1 $ \p1 ->
-    withForeignPtr x2 $ \p2 -> do
-        i <- memcmp (p1 `plusPtr` s1) (p2 `plusPtr` s2) (min l1 l2)
-        return $ case i `compare` 0 of
-                    EQ  -> l1 `compare` l2
-                    x   -> x
+compareBytes (PS x1 s1 l1) (PS x2 s2 l2)
+    | l1 == 0  && l2 == 0               = EQ  -- short cut for empty strings
+    | x1 == x2 && s1 == s2 && l1 == l2  = EQ  -- short cut for the same string
+    | otherwise                         = inlinePerformIO $
+        withForeignPtr x1 $ \p1 ->
+        withForeignPtr x2 $ \p2 -> do
+            i <- memcmp (p1 `plusPtr` s1) (p2 `plusPtr` s2) (min l1 l2)
+            return $ case i `compare` 0 of
+                        EQ  -> l1 `compare` l2
+                        x   -> x
  {-# INLINE compareBytes #-}
  
  {-
@@ -375,10 +395,29 @@ empty = inlinePerformIO $ mallocByteString 1 >>= \fp -> return $ PS fp 0 0
  
  -- | /O(1)/ Convert a 'Word8' into a 'ByteString'
  packByte :: Word8 -> ByteString
-packByte c = inlinePerformIO $ mallocByteString 2 >>= \fp -> do
+packByte c = unsafePerformIO $ mallocByteString 2 >>= \fp -> do
      withForeignPtr fp $ \p -> poke p c
      return $ PS fp 0 1
-{-# NOINLINE packByte #-}
+{-# INLINE packByte #-}
+
+--
+-- XXX The unsafePerformIO is critical!
+--
+-- Otherwise:
+--
+--  packByte 255 `compare` packByte 127
+--
+-- is compiled to:
+--
+--  case mallocByteString 2 of 
+--      ForeignPtr f internals -> 
+--           case writeWord8OffAddr# f 0 255 of _ -> 
+--           case writeWord8OffAddr# f 0 127 of _ ->
+--           case eqAddr# f f of 
+--                  False -> case compare (GHC.Prim.plusAddr# f 0) 
+--                                        (GHC.Prim.plusAddr# f 0)
+--
+--
  
  -- | /O(n)/ Convert a '[Word8]' into a 'ByteString'. 
  --
@@ -495,6 +534,8 @@ cons c (PS x s l) = create (l+1) $ \p -> withForeignPtr x $ \f -> do
          poke p c
  {-# INLINE cons #-}
  
+-- todo fuse
+
  -- | /O(n)/ Append a byte to the end of a 'ByteString'
  snoc :: ByteString -> Word8 -> ByteString
  snoc (PS x s l) c = create (l+1) $ \p -> withForeignPtr x $ \f -> do
@@ -502,6 +543,8 @@ snoc (PS x s l) c = create (l+1) $ \p -> withForeignPtr x $ \f -> do
          poke (p `plusPtr` l) c
  {-# INLINE snoc #-}
  
+-- todo fuse
+
  -- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
  head :: ByteString -> Word8
  head ps@(PS x s _)
@@ -537,52 +580,43 @@ append xs ys | null xs   = ys
               | otherwise = concat [xs,ys]
  {-# INLINE append #-}
  
-{-
---
--- About 30% faster, but allocating in a big chunk isn't good for memory use
---
-append :: ByteString -> ByteString -> ByteString
-append xs@(PS ffp s l) ys@(PS fgp t m)
-    | null xs   = ys
-    | null ys   = xs
-    | otherwise = create len $ \ptr ->
-        withForeignPtr ffp $ \fp ->
-        withForeignPtr fgp $ \gp -> do
-            memcpy ptr               (fp `plusPtr` s) l
-            memcpy (ptr `plusPtr` l) (gp `plusPtr` t) m
-        where len = length xs + length ys
--}
-
  -- ---------------------------------------------------------------------
  -- Transformations
  
  -- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each
--- element of @xs@
---
+-- element of @xs@. This function is subject to array fusion.
  map :: (Word8 -> Word8) -> ByteString -> ByteString
-map f (PS fp start len) = inlinePerformIO $ withForeignPtr fp $ \p -> do
-    new_fp <- mallocByteString len
-    withForeignPtr new_fp $ \new_p -> do
-        map_ f (len-1) (p `plusPtr` start) new_p
-        return (PS new_fp 0 len)
+map f = loopArr . loopU (mapEFL f) noAL
  {-# INLINE map #-}
  
-map_ :: (Word8 -> Word8) -> Int -> Ptr Word8 -> Ptr Word8 -> IO ()
-STRICT4(map_)
-map_ f n p1 p2
-   | n < 0 = return ()
-   | otherwise = do
-        x <- peekByteOff p1 n
-        pokeByteOff p2 n (f x)
-        map_ f (n-1) p1 p2
-{-# INLINE map_ #-}
+-- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
+-- slightly faster for one-shot cases.
+mapF :: (Word8 -> Word8) -> ByteString -> ByteString
+STRICT2(mapF)
+mapF f (PS fp s len) = inlinePerformIO $ withForeignPtr fp $ \a -> do
+    np <- mallocByteString (len+1)
+    withForeignPtr np $ \p -> do
+        map_ 0 (a `plusPtr` s) p
+        return (PS np 0 len)
+  where
+    map_ :: Int -> Ptr Word8 -> Ptr Word8 -> IO ()
+    STRICT3(map_)
+    map_ n p1 p2
+       | n >= len = return ()
+       | otherwise = do
+            x <- peekByteOff p1 n
+            pokeByteOff p2 n (f x)
+            map_ (n+1) p1 p2
+{-# INLINE mapF #-}
  
  -- | /O(n)/ 'reverse' @xs@ efficiently returns the elements of @xs@ in reverse order.
  reverse :: ByteString -> ByteString
  reverse (PS x s l) = create l $ \p -> withForeignPtr x $ \f ->
          c_reverse p (f `plusPtr` s) l
  
--- reverse = pack . P.reverse . unpack
+{-
+reverse = pack . P.reverse . unpack
+-}
  
  -- | /O(n)/ The 'intersperse' function takes a 'Word8' and a
  -- 'ByteString' and \`intersperses\' that byte between the elements of
@@ -594,7 +628,9 @@ intersperse c ps@(PS x s l)
      | otherwise      = create (2*l-1) $ \p -> withForeignPtr x $ \f ->
          c_intersperse p (f `plusPtr` s) l c
  
--- intersperse c = pack . List.intersperse c . unpack
+{-
+intersperse c = pack . List.intersperse c . unpack
+-}
  
  -- | The 'transpose' function transposes the rows and columns of its
  -- 'ByteString' argument.
@@ -607,7 +643,16 @@ transpose ps = P.map pack (List.transpose (P.map unpack ps))
  -- | 'foldl', applied to a binary operator, a starting value (typically
  -- the left-identity of the operator), and a ByteString, reduces the
  -- ByteString using the binary operator, from left to right.
+-- This function is subject to array fusion.
  foldl :: (a -> Word8 -> a) -> a -> ByteString -> a
+foldl f z = loopAcc . loopU (foldEFL f) z
+{-# INLINE foldl #-}
+
+{-
+--
+-- About twice as fast with 6.4.1, but not fuseable
+-- A simple fold . map is enough to make it worth while.
+--
  foldl f v (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
          lgo v (ptr `plusPtr` s) (ptr `plusPtr` (s+l))
      where
@@ -615,6 +660,7 @@ foldl f v (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
          lgo z p q | p == q    = return z
                    | otherwise = do c <- peek p
                                     lgo (f z c) (p `plusPtr` 1) q
+-}
  
  -- | 'foldr', applied to a binary operator, a starting value
  -- (typically the right-identity of the operator), and a ByteString,
@@ -631,6 +677,7 @@ foldr k z (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
  
  -- | 'foldl1' is a variant of 'foldl' that has no starting value
  -- argument, and thus must be applied to non-empty 'ByteStrings'.
+-- This function is subject to array fusion.
  foldl1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
  foldl1 f ps
      | null ps   = errorEmptyList "foldl1"
@@ -641,7 +688,7 @@ foldl1 f ps
  foldr1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
  foldr1 f ps
      | null ps        = errorEmptyList "foldr1"
-    | otherwise      = f (unsafeHead ps) (foldr1 f (unsafeTail ps))
+    | otherwise      = foldr f (last ps) (init ps)
  
  -- ---------------------------------------------------------------------
  -- Special folds
@@ -650,26 +697,13 @@ foldr1 f ps
  concat :: [ByteString] -> ByteString
  concat []     = empty
  concat [ps]   = ps
-concat xs     = inlinePerformIO $ do
-    let start_size = 1024
-    p <- mallocArray start_size
-    f p 0 1024 xs
-
-    where f ptr len _ [] = do
-                ptr' <- reallocArray ptr (len+1)
-                poke (ptr' `plusPtr` len) (0::Word8)    -- XXX so CStrings work
-                fp   <- newForeignFreePtr ptr'
-                return $ PS fp 0 len
-
-          f ptr len to_go pss@(PS p s l:pss')
-           | l <= to_go = do withForeignPtr p $ \pf ->
-                                 memcpy (ptr `plusPtr` len)
-                                          (pf `plusPtr` s) l
-                             f ptr (len + l) (to_go - l) pss'
-
-           | otherwise = do let new_total = ((len + to_go) * 2) `max` (len + l)
-                            ptr' <- reallocArray ptr new_total
-                            f ptr' len (new_total - len) pss
+concat xs     = create len $ \ptr -> go xs ptr
+  where len = P.sum . P.map length $ xs
+        STRICT2(go)
+        go []            _   = return ()
+        go (PS p s l:ps) ptr = do
+                withForeignPtr p $ \fp -> memcpy ptr (fp `plusPtr` s) l
+                go ps (ptr `plusPtr` l)
  
  -- | Map a function over a 'ByteString' and concatenate the results
  concatMap :: (Word8 -> ByteString) -> ByteString -> ByteString
@@ -688,6 +722,8 @@ any f (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
                                  if f c then return True
                                         else go (p `plusPtr` 1) q
  
+-- todo fuse
+
  -- | /O(n)/ Applied to a predicate and a 'ByteString', 'all' determines
  -- if all elements of the 'ByteString' satisfy the predicate.
  all :: (Word8 -> Bool) -> ByteString -> Bool
@@ -701,6 +737,7 @@ all f (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
                                   if f c
                                      then go (p `plusPtr` 1) q
                                      else return False
+-- todo fuse
  
  -- | /O(n)/ 'maximum' returns the maximum value from a 'ByteString'
  maximum :: ByteString -> Word8
@@ -708,6 +745,7 @@ maximum xs@(PS x s l)
      | null xs   = errorEmptyList "maximum"
      | otherwise = inlinePerformIO $ withForeignPtr x $ \p ->
                      return $ c_maximum (p `plusPtr` s) l
+{-# INLINE maximum #-}
  
  -- | /O(n)/ 'minimum' returns the minimum value from a 'ByteString'
  minimum :: ByteString -> Word8
@@ -715,6 +753,9 @@ minimum xs@(PS x s l)
      | null xs   = errorEmptyList "minimum"
      | otherwise = inlinePerformIO $ withForeignPtr x $ \p ->
                      return $ c_minimum (p `plusPtr` s) l
+{-# INLINE minimum #-}
+
+-- fusion is too slow here (10x)
  
  {-
  maximum xs@(PS x s l)
@@ -722,7 +763,6 @@ maximum xs@(PS x s l)
      | otherwise = inlinePerformIO $ withForeignPtr x $ \p -> do
                          w <- peek p
                          maximum_ (p `plusPtr` s) 0 l w
-{-# INLINE maximum #-}
  
  maximum_ :: Ptr Word8 -> Int -> Int -> Word8 -> IO Word8
  STRICT4(maximum_)
@@ -736,7 +776,6 @@ minimum xs@(PS x s l)
      | otherwise = inlinePerformIO $ withForeignPtr x $ \p -> do
                          w <- peek p
                          minimum_ (p `plusPtr` s) 0 l w
-{-# INLINE minimum #-}
  
  minimum_ :: Ptr Word8 -> Int -> Int -> Word8 -> IO Word8
  STRICT4(minimum_)
@@ -745,6 +784,7 @@ minimum_ ptr n m c
      | otherwise = do w <- peekByteOff ptr n
                       minimum_ ptr (n+1) m (if w < c then w else c)
  -}
+
  -- | /O(n)/ map Word8 functions, provided with the index at each position
  mapIndexed :: (Int -> Word8 -> Word8) -> ByteString -> ByteString
  mapIndexed k (PS ps s l) = create l $ \p -> withForeignPtr ps $ \f ->
@@ -757,17 +797,6 @@ mapIndexed k (PS ps s l) = create l $ \p -> withForeignPtr ps $ \f ->
                                  ((poke t) . k n) w
                                  go (n+1) (f `plusPtr` 1) (t `plusPtr` 1) p
  
--- | /O(n)/ Hash a ByteString into an 'Int32' value, suitable for use as a key.
-hash :: ByteString -> Int32
-hash (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p ->
-    go (0 :: Int32) (p `plusPtr` s) l
-  where
-    go :: Int32 -> Ptr Word8 -> Int -> IO Int32
-    STRICT3(go)
-    go h _ 0 = return h
-    go h p n = do w <- peek p
-                  go (fromIntegral w + rotateL h 8) (p `plusPtr` 1) (n-1)
-
  -- ---------------------------------------------------------------------
  -- Unfolds and replicates
  
@@ -877,6 +906,24 @@ breakByte c p = case elemIndex c p of
      Just n  -> (take n p, drop n p)
  {-# INLINE breakByte #-}
  
+-- | 'spanByte' breaks its ByteString argument at the first
+-- occurence of a byte other than its argument. It is more efficient
+-- than 'span (==)'
+--
+-- > span  (=='c') "abcd" == spanByte 'c' "abcd"
+--
+spanByte :: Word8 -> ByteString -> (ByteString, ByteString)
+spanByte c ps@(PS x s l) = inlinePerformIO $ withForeignPtr x $ \p ->
+    go (p `plusPtr` s) 0
+  where
+    STRICT2(go)
+    go p i | i >= l    = return (ps, empty)
+           | otherwise = do c' <- peekByteOff p i
+                            if c /= c'
+                                then return (take i ps, drop i ps)
+                                else go p (i+1)
+{-# INLINE spanByte #-}
+
  -- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
  -- occurence of @w@. It behaves like 'break', except the delimiter is
  -- not returned, and @Nothing@ is returned if the delimiter is not in
@@ -914,7 +961,7 @@ breakLast c p = case elemIndexLast c p of
  -- | 'span' @p xs@ breaks the ByteString into two segments. It is
  -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
  span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
-span  p ps = break (not . p) ps
+span p ps = break (not . p) ps
  {-# INLINE span #-}
  
  -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
@@ -1041,6 +1088,31 @@ split (W8# w#) (PS fp off len) = splitWith' off len fp
  tokens :: (Word8 -> Bool) -> ByteString -> [ByteString]
  tokens f = P.filter (not.null) . splitWith f
  
+-- | The 'group' function takes a ByteString and returns a list of
+-- ByteStrings such that the concatenation of the result is equal to the
+-- argument.  Moreover, each sublist in the result contains only equal
+-- elements.  For example,
+--
+-- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
+--
+-- It is a special case of 'groupBy', which allows the programmer to
+-- supply their own equality test. It is about 40% faster than 
+-- /groupBy (==)/
+group :: ByteString -> [ByteString]
+group xs
+    | null xs   = []
+    | otherwise = ys : group zs
+    where
+        (ys, zs) = spanByte (unsafeHead xs) xs
+
+-- | The 'groupBy' function is the non-overloaded version of 'group'.
+groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+groupBy k xs
+    | null xs   = []
+    | otherwise = take n xs : groupBy k (drop n xs)
+    where
+        n = 1 + findIndexOrEnd (not . k (unsafeHead xs)) (unsafeTail xs)
+
  -- | /O(n)/ The 'join' function takes a 'ByteString' and a list of
  -- 'ByteString's and concatenates the list after interspersing the first
  -- argument between each element of the list.
@@ -1143,6 +1215,14 @@ elemIndices c ps = loop 0 ps
  -- But more efficiently than using length on the intermediate list.
  count :: Word8 -> ByteString -> Int
  count w (PS x s m) = inlinePerformIO $ withForeignPtr x $ \p ->
+    return $ c_count (p `plusPtr` s) (fromIntegral m) w
+{-# INLINE count #-}
+
+{-
+--
+-- around 30% slower
+--
+count w (PS x s m) = inlinePerformIO $ withForeignPtr x $ \p ->
       go (p `plusPtr` s) (fromIntegral m) 0
      where
          go :: Ptr Word8 -> CSize -> Int -> IO Int
@@ -1153,13 +1233,23 @@ count w (PS x s m) = inlinePerformIO $ withForeignPtr x $ \p ->
                  then return i
                  else do let k = fromIntegral $ q `minusPtr` p
                          go (q `plusPtr` 1) (l-k-1) (i+1)
-{-# INLINE count #-}
+-}
  
  -- | The 'findIndex' function takes a predicate and a 'ByteString' and
  -- returns the index of the first element in the ByteString
  -- satisfying the predicate.
  findIndex :: (Word8 -> Bool) -> ByteString -> Maybe Int
-findIndex = (listToMaybe .) . findIndices
+findIndex k ps@(PS x s l)
+    | null ps   = Nothing
+    | otherwise = inlinePerformIO $ withForeignPtr x $ \f -> go (f `plusPtr` s) 0
+  where
+    STRICT2(go)
+    go ptr n | n >= l    = return Nothing
+             | otherwise = do w <- peek ptr
+                              if k w
+                                then return (Just n)
+                                else go (ptr `plusPtr` 1) (n+1)
+{-# INLINE findIndex #-}
  
  -- | The 'findIndices' function extends 'findIndex', by returning the
  -- indices of all elements satisfying the predicate, in ascending order.
@@ -1167,8 +1257,8 @@ findIndices :: (Word8 -> Bool) -> ByteString -> [Int]
  findIndices p ps = loop 0 ps
     where
       STRICT2(loop)
-     loop _ qs | null qs           = []
-     loop n qs | p (unsafeHead qs) = n : loop (n+1) (unsafeTail qs)
+     loop n qs | null qs           = []
+               | p (unsafeHead qs) = n : loop (n+1) (unsafeTail qs)
                 | otherwise         =     loop (n+1) (unsafeTail qs)
  
  -- ---------------------------------------------------------------------
@@ -1181,9 +1271,34 @@ elem c ps = case elemIndex c ps of Nothing -> False ; _ -> True
  
  -- | /O(n)/ 'notElem' is the inverse of 'elem'
  notElem :: Word8 -> ByteString -> Bool
-notElem c ps = case elemIndex c ps of Nothing -> True ; _ -> False
+notElem c ps = not (elem c ps)
  {-# INLINE notElem #-}
  
+-- | /O(n)/ 'filter', applied to a predicate and a ByteString,
+-- returns a ByteString containing those characters that satisfy the
+-- predicate. This function is subject to array fusion.
+filter :: (Word8 -> Bool) -> ByteString -> ByteString
+filter p  = loopArr . loopU (filterEFL p) noAL
+{-# INLINE filter #-}
+
+-- | /O(n)/ 'filterF' is a non-fuseable version of filter, that may be
+-- around 2x faster for some one-shot applications.
+filterF :: (Word8 -> Bool) -> ByteString -> ByteString
+filterF k ps@(PS x s l)
+    | null ps   = ps
+    | otherwise = inlinePerformIO $ generate l $ \p -> withForeignPtr x $ \f -> do
+        t <- go (f `plusPtr` s) p (f `plusPtr` (s + l))
+        return (t `minusPtr` p) -- actual length
+    where
+        STRICT3(go)
+        go f t end | f == end  = return t
+                   | otherwise = do
+                        w <- peek f
+                        if k w
+                            then poke t w >> go (f `plusPtr` 1) (t `plusPtr` 1) end
+                            else             go (f `plusPtr` 1) t               end
+{-# INLINE filterF #-}
+
  --
  -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common
  -- case of filtering a single byte. It is more efficient to use
@@ -1195,23 +1310,7 @@ notElem c ps = case elemIndex c ps of Nothing -> True ; _ -> False
  -- filter equivalent
  filterByte :: Word8 -> ByteString -> ByteString
  filterByte w ps = replicate (count w ps) w
-
-{-
--- slower than the replicate version
-
-filterByte ch ps@(PS x s l)
-    | null ps   = ps
-    | otherwise = inlinePerformIO $ generate l $ \p -> withForeignPtr x $ \f -> do
-        t <- go (f `plusPtr` s) p l
-        return (t `minusPtr` p) -- actual length
-    where
-        STRICT3(go)
-        go _ t 0 = return t
-        go f t e = do w <- peek f
-                      if w == ch
-                        then poke t w >> go (f `plusPtr` 1) (t `plusPtr` 1) (e-1)
-                        else             go (f `plusPtr` 1) t               (e-1)
--}
+{-# INLINE filterByte #-}
  
  --
  -- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
@@ -1220,48 +1319,32 @@ filterByte ch ps@(PS x s l)
  --
  -- > filterNotByte == filter . (/=)
  --
--- filterNotByte is around 3x faster, and uses much less space, than its
--- filter equivalent
+-- filterNotByte is around 2x faster than its filter equivalent.
  filterNotByte :: Word8 -> ByteString -> ByteString
-filterNotByte ch ps@(PS x s l)
-    | null ps   = ps
-    | otherwise = inlinePerformIO $ generate l $ \p -> withForeignPtr x $ \f -> do
-        t <- go (f `plusPtr` s) p l
-        return (t `minusPtr` p) -- actual length
-    where
-        STRICT3(go)
-        go _ t 0 = return t
-        go f t e = do w <- peek f
-                      if w /= ch
-                        then poke t w >> go (f `plusPtr` 1) (t `plusPtr` 1) (e-1)
-                        else             go (f `plusPtr` 1) t               (e-1)
-
--- | /O(n)/ 'filter', applied to a predicate and a ByteString,
--- returns a ByteString containing those characters that satisfy the
--- predicate.
-filter :: (Word8 -> Bool) -> ByteString -> ByteString
-filter k ps@(PS x s l)
-    | null ps   = ps
-    | otherwise = inlinePerformIO $ generate l $ \p -> withForeignPtr x $ \f -> do
-        t <- go (f `plusPtr` s) p l
-        return (t `minusPtr` p) -- actual length
-    where
-        STRICT3(go)
-        go _ t 0 = return t
-        go f t e = do w <- peek f
-                      if k w
-                        then poke t w >> go (f `plusPtr` 1) (t `plusPtr` 1) (e - 1)
-                        else             go (f `plusPtr` 1) t               (e - 1)
-
--- Almost as good: pack $ foldl (\xs c -> if f c then c : xs else xs) [] ps
+filterNotByte w = filterF (/= w)
+{-# INLINE filterNotByte #-}
  
  -- | /O(n)/ The 'find' function takes a predicate and a ByteString,
  -- and returns the first element in matching the predicate, or 'Nothing'
  -- if there is no such element.
+--
+-- > find f p = case findIndex f p of Just n -> Just (p ! n) ; _ -> Nothing
+--
  find :: (Word8 -> Bool) -> ByteString -> Maybe Word8
-find p ps = case filter p ps of
-    q | null q -> Nothing
-      | otherwise -> Just (unsafeHead q)
+find f p = case findIndex f p of
+                    Just n -> Just (p `unsafeIndex` n)
+                    _      -> Nothing
+{-# INLINE find #-}
+
+{-
+--
+-- fuseable, but we don't want to walk the whole array.
+-- 
+find k = foldl findEFL Nothing
+    where findEFL a@(Just _) _ = a
+          findEFL _          c | k c       = Just c
+                               | otherwise = Nothing
+-}
  
  -- ---------------------------------------------------------------------
  -- Searching for substrings
@@ -1386,13 +1469,51 @@ elems (PS x s l) = (PS x s 1:elems (PS x (s+1) (l-1)))
  -- ---------------------------------------------------------------------
  -- ** Ordered 'ByteString's
  
--- | /O(n log(n))/ Sort a ByteString efficiently, using qsort(3).
+-- | /O(n)/ Sort a ByteString efficiently, using counting sort.
+sort :: ByteString -> ByteString
+sort (PS input s l) = create l $ \p -> allocaArray 256 $ \arr -> do
+
+    memset (castPtr arr) 0 (256 * fromIntegral (sizeOf (undefined :: CSize)))
+    withForeignPtr input (\x -> countEach arr (x `plusPtr` s) l)
+
+    let STRICT2(go)
+        go 256 _   = return ()
+        go i   ptr = do n <- peekElemOff arr i
+                        when (n /= 0) $ memset ptr (fromIntegral i) n >> return ()
+                        go (i + 1) (ptr `plusPtr` (fromIntegral n))
+    go 0 p
+
+-- "countEach counts str l" counts the number of occurences of each Word8 in
+-- str, and stores the result in counts.
+countEach :: Ptr CSize -> Ptr Word8 -> Int -> IO ()
+STRICT3(countEach)
+countEach counts str l = go 0
+ where
+    STRICT1(go)
+    go i | i == l    = return ()
+         | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
+                          x <- peekElemOff counts k
+                          pokeElemOff counts k (x + 1)
+                          go (i + 1)
+
+{-
  sort :: ByteString -> ByteString
  sort (PS x s l) = create l $ \p -> withForeignPtr x $ \f -> do
          memcpy p (f `plusPtr` s) l
          c_qsort p l -- inplace
+-}
  
--- sort = pack . List.sort . unpack
+{-
+sort = pack . List.sort . unpack
+-}
+
+-- | The 'sortBy' function is the non-overloaded version of 'sort'.
+--
+-- Try some linear sorts: radix, counting
+-- Or mergesort.
+--
+-- sortBy :: (Word8 -> Word8 -> Ordering) -> ByteString -> ByteString
+-- sortBy f ps = undefined
  
  -- ---------------------------------------------------------------------
  --
@@ -1584,7 +1705,7 @@ unsafeUseAsCStringLen (PS ps s l) ac = withForeignPtr ps $ \p -> ac (castPtr p `
  --
  generate :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
  generate i f = do
-    p <- mallocArray i
+    p <- mallocArray (i+1)
      i' <- f p
      p' <- reallocArray p (i'+1)
      poke (p' `plusPtr` i') (0::Word8)    -- XXX so CStrings work
@@ -1659,7 +1780,7 @@ hGetLine h = wantReadableHandle "Data.ByteString.hGetLine" h $ \ handle_ -> do
  mkPS :: RawBuffer -> Int -> Int -> IO ByteString
  mkPS buf start end = do
      let len = end - start
-    fp <- mallocByteString (len `quot` 8)
+    fp <- mallocByteString len
      withForeignPtr fp $ \p -> do
          memcpy_ptr_baoff p buf start (fromIntegral len)
          return (PS fp 0 len)
@@ -1726,15 +1847,15 @@ hGetContents h = do
          else f p start_size
      where
          f p s = do
-        let s' = 2 * s
-        p' <- reallocArray p s'
-        i  <- hGetBuf h (p' `plusPtr` s) s
-        if i < s
-            then do let i' = s + i
-                    p'' <- reallocArray p' i'
-                    fp  <- newForeignFreePtr p''
-                    return $ PS fp 0 i'
-            else f p' s'
+            let s' = 2 * s
+            p' <- reallocArray p s'
+            i  <- hGetBuf h (p' `plusPtr` s) s
+            if i < s
+                then do let i' = s + i
+                        p'' <- reallocArray p' i'
+                        fp  <- newForeignFreePtr p''
+                        return $ PS fp 0 i'
+                else f p' s'
  
  -- | getContents. Equivalent to hGetContents stdin
  getContents :: IO ByteString
@@ -1759,6 +1880,62 @@ writeFile f ps = do
      hPut h ps
      hClose h
  
+{-
+--
+-- Disable until we can move it into a portable .hsc file
+--
+
+-- | Like readFile, this reads an entire file directly into a
+-- 'ByteString', but it is even more efficient.  It involves directly
+-- mapping the file to memory.  This has the advantage that the contents
+-- of the file never need to be copied.  Also, under memory pressure the
+-- page may simply be discarded, while in the case of readFile it would
+-- need to be written to swap.  If you read many small files, mmapFile
+-- will be less memory-efficient than readFile, since each mmapFile
+-- takes up a separate page of memory.  Also, you can run into bus
+-- errors if the file is modified.  As with 'readFile', the string
+-- representation in the file is assumed to be ISO-8859-1.
+--
+-- On systems without mmap, this is the same as a readFile.
+--
+mmapFile :: FilePath -> IO ByteString
+mmapFile f = mmap f >>= \(fp,l) -> return $ PS fp 0 l
+
+mmap :: FilePath -> IO (ForeignPtr Word8, Int)
+mmap f = do
+    h <- openBinaryFile f ReadMode
+    l <- fromIntegral `fmap` hFileSize h
+    -- Don't bother mmaping small files because each mmapped file takes up
+    -- at least one full VM block.
+    if l < mmap_limit
+       then do thefp <- mallocByteString l
+               withForeignPtr thefp $ \p-> hGetBuf h p l
+               hClose h
+               return (thefp, l)
+       else do
+               -- unix only :(
+               fd <- fromIntegral `fmap` handleToFd h
+               p  <- my_mmap l fd
+               fp <- if p == nullPtr
+                     then do thefp <- mallocByteString l
+                             withForeignPtr thefp $ \p' -> hGetBuf h p' l
+                             return thefp
+                     else do
+                          -- The munmap leads to crashes on OpenBSD.
+                          -- maybe there's a use after unmap in there somewhere?
+#if !defined(__OpenBSD__)
+                             let unmap = c_munmap p l >> return ()
+#else
+                             let unmap = return ()
+#endif
+                             fp <- FC.newForeignPtr p unmap
+                             return fp
+               c_close fd
+               hClose h
+               return (fp, l)
+    where mmap_limit = 16*1024
+-}
+
  #if defined(__GLASGOW_HASKELL__)
  --
  -- | A ByteString equivalent for getArgs. More efficient for large argument lists
@@ -1801,8 +1978,9 @@ mallocByteString l = do
  
  -- | A way of creating ForeignPtrs outside the IO monad. The @Int@
  -- argument gives the final size of the ByteString. Unlike 'generate'
--- the ByteString is no reallocated if the final size is less than the
--- estimated size.
+-- the ByteString is not reallocated if the final size is less than the
+-- estimated size. Also, unlike 'generate' ByteString's created this way
+-- are managed on the Haskell heap.
  create :: Int -> (Ptr Word8 -> IO ()) -> ByteString
  create l write_ptr = inlinePerformIO $ do
      fp <- mallocByteString (l+1)
@@ -1906,8 +2084,24 @@ foreign import ccall unsafe "static fpstring.h maximum" c_maximum
  foreign import ccall unsafe "static fpstring.h minimum" c_minimum
      :: Ptr Word8 -> Int -> Word8
  
-foreign import ccall unsafe "static fpstring.h my_qsort" c_qsort
-    :: Ptr Word8 -> Int -> IO ()
+foreign import ccall unsafe "static fpstring.h count" c_count
+    :: Ptr Word8 -> Int -> Word8 -> Int
+
+-- ---------------------------------------------------------------------
+-- MMap
+
+{-
+foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
+    :: Int -> Int -> IO (Ptr Word8)
+
+foreign import ccall unsafe "static unistd.h close" c_close
+    :: Int -> IO Int
+
+#  if !defined(__OpenBSD__)
+foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
+    :: Ptr Word8 -> Int -> IO Int
+#  endif
+-}
  
  -- ---------------------------------------------------------------------
  -- Internal GHC Haskell magic
@@ -1919,3 +2113,123 @@ foreign import ccall unsafe "RtsAPI.h getProgArgv"
  foreign import ccall unsafe "__hscore_memcpy_src_off"
     memcpy_ptr_baoff :: Ptr a -> RawBuffer -> Int -> CSize -> IO (Ptr ())
  #endif
+
+-- ---------------------------------------------------------------------
+--
+-- Functional array fusion for ByteStrings. 
+--
+-- From the Data Parallel Haskell project, 
+--      http://www.cse.unsw.edu.au/~chak/project/dph/
+--
+
+-- |Data type for accumulators which can be ignored. The rewrite rules rely on
+-- the fact that no bottoms of this type are ever constructed; hence, we can
+-- assume @(_ :: NoAL) `seq` x = x@.
+--
+data NoAL = NoAL
+
+-- | Special forms of loop arguments
+--
+-- * These are common special cases for the three function arguments of gen
+--   and loop; we give them special names to make it easier to trigger RULES
+--   applying in the special cases represented by these arguments.  The
+--   "INLINE [1]" makes sure that these functions are only inlined in the last
+--   two simplifier phases.
+--
+-- * In the case where the accumulator is not needed, it is better to always
+--   explicitly return a value `()', rather than just copy the input to the
+--   output, as the former gives GHC better local information.
+-- 
+
+-- | Element function expressing a mapping only
+mapEFL :: (Word8 -> Word8) -> (NoAL -> Word8 -> (NoAL, Maybe Word8))
+mapEFL f = \_ e -> (noAL, (Just $ f e))
+{-# INLINE [1] mapEFL #-}
+
+-- | Element function implementing a filter function only
+filterEFL :: (Word8 -> Bool) -> (NoAL -> Word8 -> (NoAL, Maybe Word8))
+filterEFL p = \_ e -> if p e then (noAL, Just e) else (noAL, Nothing)
+{-# INLINE [1] filterEFL #-}
+
+-- |Element function expressing a reduction only
+foldEFL :: (acc -> Word8 -> acc) -> (acc -> Word8 -> (acc, Maybe Word8))
+foldEFL f = \a e -> (f a e, Nothing)
+{-# INLINE [1] foldEFL #-}
+
+-- | No accumulator
+noAL :: NoAL
+noAL = NoAL
+{-# INLINE [1] noAL #-}
+
+-- | Projection functions that are fusion friendly (as in, we determine when
+-- they are inlined)
+loopArr :: (ByteString, acc) -> ByteString
+loopArr (arr, _) = arr
+{-# INLINE [1] loopArr #-}
+
+loopAcc :: (ByteString, acc) -> acc
+loopAcc (_, acc) = acc
+{-# INLINE [1] loopAcc #-}
+
+loopSndAcc :: (ByteString, (acc1, acc2)) -> (ByteString, acc2)
+loopSndAcc (arr, (_, acc)) = (arr, acc)
+{-# INLINE [1] loopSndAcc #-}
+
+------------------------------------------------------------------------
+
+-- | Iteration over over ByteStrings
+loopU :: (acc -> Word8 -> (acc, Maybe Word8))  -- ^ mapping & folding, once per elem
+      -> acc                                   -- ^ initial acc value
+      -> ByteString                            -- ^ input ByteString
+      -> (ByteString, acc)
+
+loopU f start (PS z s i) = inlinePerformIO $ withForeignPtr z $ \a -> do
+    fp          <- mallocByteString i
+    (ptr,n,acc) <- withForeignPtr fp $ \p -> do
+        (acc, i') <- go (a `plusPtr` s) p start
+        if i' == i
+            then return (fp,i,acc)                      -- no realloc for map
+            else do fp_ <- mallocByteString (i'+1)      -- realloc
+                    withForeignPtr fp_ $ \p' -> do
+                        memcpy p' p i'
+                        poke (p' `plusPtr` i') (0::Word8)
+                    return (fp_,i',acc)
+
+    return (PS ptr 0 n, acc)
+  where
+    go p ma = trans 0 0
+        where
+            STRICT3(trans)
+            trans a_off ma_off acc
+                | a_off >= i = return (acc, ma_off)
+                | otherwise  = do
+                    x <- peekByteOff p a_off
+                    let (acc', oe) = f acc x
+                    ma_off' <- case oe of
+                        Nothing  -> return ma_off
+                        Just e   -> do pokeByteOff ma ma_off e
+                                       return $ ma_off + 1
+                    trans (a_off+1) ma_off' acc'
+
+{-# INLINE [1] loopU #-}
+
+{-# RULES
+
+"array fusion!" forall em1 em2 start1 start2 arr.
+  loopU em2 start2 (loopArr (loopU em1 start1 arr)) =
+    let em (acc1, acc2) e =
+            case em1 acc1 e of
+                (acc1', Nothing) -> ((acc1', acc2), Nothing)
+                (acc1', Just e') ->
+                    case em2 acc2 e' of
+                        (acc2', res) -> ((acc1', acc2'), res)
+    in loopSndAcc (loopU em (start1, start2) arr)
+
+"loopArr/loopSndAcc" forall x.
+  loopArr (loopSndAcc x) = loopArr x
+
+"seq/NoAL" forall (u::NoAL) e.
+  u `seq` e = e
+
+ #-}
+