From 4a1f785332f15f382c5f5a729bbdaba54d69870a Mon Sep 17 00:00:00 2001
From: Don Stewart <dons@cse.unsw.edu.au>
Date: Sat, 1 Jul 2006 08:43:45 +0000
Subject: [PATCH] Import Data.ByteString.Lazy, improve ByteString Fusion, and
 resync with FPS head

This patch imports the Data.ByteString.Lazy module, and its helpers,
providing a ByteString implemented as a lazy list of strict cache-sized
chunks. This type allows the usual lazy operations to be written on
bytestrings, including lazy IO, with much improved space and time over
the [Char] equivalents.
---
 Data/ByteString.hs            | 1016 ++++++++++-----------------------
 Data/ByteString/Base.hs       |  463 +++++++++++++++
 Data/ByteString/Char8.hs      |  205 +++----
 Data/ByteString/Fusion.hs     |  700 +++++++++++++++++++++++
 Data/ByteString/Lazy.hs       | 1246 +++++++++++++++++++++++++++++++++++++++++
 Data/ByteString/Lazy/Char8.hs |  693 +++++++++++++++++++++++
 Makefile                      |    1 +
 base.cabal                    |    4 +
 cbits/fpstring.c              |   10 +-
 include/fpstring.h            |   10 +-
 package.conf.in               |    4 +
 11 files changed, 3507 insertions(+), 845 deletions(-)
 create mode 100644 Data/ByteString/Base.hs
 create mode 100644 Data/ByteString/Fusion.hs
 create mode 100644 Data/ByteString/Lazy.hs
 create mode 100644 Data/ByteString/Lazy/Char8.hs

diff --git a/Data/ByteString.hs b/Data/ByteString.hs
index 2001110..9187ff5 100644
--- a/Data/ByteString.hs
+++ b/Data/ByteString.hs
@@ -1,4 +1,4 @@
-{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
+{-# OPTIONS_GHC -cpp -fglasgow-exts -fno-warn-orphans #-}
 --
 -- Module      : ByteString
 -- Copyright   : (c) The University of Glasgow 2001,
@@ -39,11 +39,11 @@
 module Data.ByteString (
 
         -- * The @ByteString@ type
-        ByteString(..),         -- instances: Eq, Ord, Show, Read, Data, Typeable
+        ByteString,             -- abstract, instances: Eq, Ord, Show, Read, Data, Typeable, Monoid
 
         -- * Introducing and eliminating 'ByteString's
         empty,                  -- :: ByteString
-        singleton,               -- :: Word8   -> ByteString
+        singleton,              -- :: Word8   -> ByteString
         pack,                   -- :: [Word8] -> ByteString
         unpack,                 -- :: ByteString -> [Word8]
         packWith,               -- :: (a -> Word8) -> [a] -> ByteString
@@ -87,10 +87,12 @@ module Data.ByteString (
         -- ** Scans
         scanl,                  -- :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
         scanl1,                 -- :: (Word8 -> Word8 -> Word8) -> ByteString -> ByteString
+        scanr,                  -- :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
+        scanr1,                 -- :: (Word8 -> Word8 -> Word8) -> ByteString -> ByteString
 
         -- ** Accumulating maps
         mapAccumL,              -- :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
---      mapAccumR,              -- :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
+        mapAccumR,              -- :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
         mapIndexed,             -- :: (Int -> Word8 -> Word8) -> ByteString -> ByteString
 
         -- ** Unfolding ByteStrings
@@ -109,6 +111,7 @@ module Data.ByteString (
         span,                   -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
         spanEnd,                -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
         break,                  -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+        breakEnd,               -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
         group,                  -- :: ByteString -> [ByteString]
         groupBy,                -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
         inits,                  -- :: ByteString -> [ByteString]
@@ -164,47 +167,28 @@ module Data.ByteString (
         -- * Zipping and unzipping ByteStrings
         zip,                    -- :: ByteString -> ByteString -> [(Word8,Word8)]
         zipWith,                -- :: (Word8 -> Word8 -> c) -> ByteString -> ByteString -> [c]
+        zipWith', 
         unzip,                  -- :: [(Word8,Word8)] -> (ByteString,ByteString)
 
         -- * Ordered ByteStrings
         sort,                   -- :: ByteString -> ByteString
 
-        -- * Unchecked access
-        unsafeHead,             -- :: ByteString -> Word8
-        unsafeTail,             -- :: ByteString -> ByteString
-        unsafeIndex,            -- :: ByteString -> Int -> Word8
-        unsafeTake,             -- :: Int -> ByteString -> ByteString
-        unsafeDrop,             -- :: Int -> ByteString -> ByteString
-
-        -- * Low level introduction and elimination
-        generate,               -- :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
-        create,                 -- :: Int -> (Ptr Word8 -> IO ()) -> ByteString
-        fromForeignPtr,         -- :: ForeignPtr Word8 -> Int -> ByteString
-        toForeignPtr,           -- :: ByteString -> (ForeignPtr Word8, Int, Int)
-        skipIndex,              -- :: ByteString -> Int
+        -- * Low level CString conversions
 
         -- ** Packing CStrings and pointers
         packCString,            -- :: CString -> ByteString
         packCStringLen,         -- :: CString -> ByteString
         packMallocCString,      -- :: CString -> ByteString
 
-#if defined(__GLASGOW_HASKELL__)
-        packCStringFinalizer,   -- :: Ptr Word8 -> Int -> IO () -> IO ByteString
-        packAddress,            -- :: Addr# -> ByteString
-        unsafePackAddress,      -- :: Int -> Addr# -> ByteString
-        unsafeFinalize,         -- :: ByteString -> IO ()
-#endif
-
         -- ** Using ByteStrings as CStrings
         useAsCString,           -- :: ByteString -> (CString -> IO a) -> IO a
-        unsafeUseAsCString,     -- :: ByteString -> (CString -> IO a) -> IO a
-        unsafeUseAsCStringLen,  -- :: ByteString -> (CStringLen -> IO a) -> IO a
+        useAsCStringLen,        -- :: ByteString -> (CStringLen -> IO a) -> IO a
 
         -- ** Copying ByteStrings
         -- | These functions perform memcpy(3) operations
         copy,                   -- :: ByteString -> ByteString
-        copyCString,            -- :: CString -> ByteString
-        copyCStringLen,         -- :: CStringLen -> ByteString
+        copyCString,            -- :: CString -> IO ByteString
+        copyCStringLen,         -- :: CStringLen -> IO ByteString
 
         -- * I\/O with 'ByteString's
 
@@ -220,6 +204,7 @@ module Data.ByteString (
         -- ** Files
         readFile,               -- :: FilePath -> IO ByteString
         writeFile,              -- :: FilePath -> ByteString -> IO ()
+        appendFile,             -- :: FilePath -> ByteString -> IO ()
 --      mmapFile,               -- :: FilePath -> IO ByteString
 
         -- ** I\/O with Handles
@@ -232,16 +217,14 @@ module Data.ByteString (
         hGetContents,           -- :: Handle -> IO ByteString
         hGet,                   -- :: Handle -> Int -> IO ByteString
         hPut,                   -- :: Handle -> ByteString -> IO ()
+        hPutStr,                -- :: Handle -> ByteString -> IO ()
+        hPutStrLn,              -- :: Handle -> ByteString -> IO ()
 
         -- * Fusion utilities
 #if defined(__GLASGOW_HASKELL__)
         unpackList, -- eek, otherwise it gets thrown away by the simplifier
 #endif
-
-        noAL, NoAL, loopArr, loopAcc, loopSndAcc,
-        loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL,
-        mapAccumEFL, mapIndexEFL,
-
+        lengthU, maximumU, minimumU
   ) where
 
 import qualified Prelude as P
@@ -250,13 +233,16 @@ import Prelude hiding           (reverse,head,tail,last,init,null
                                 ,concat,any,take,drop,splitAt,takeWhile
                                 ,dropWhile,span,break,elem,filter,maximum
                                 ,minimum,all,concatMap,foldl1,foldr1
-                                ,scanl,scanl1,readFile,writeFile,replicate
+                                ,scanl,scanl1,scanr,scanr1
+                                ,readFile,writeFile,appendFile,replicate
                                 ,getContents,getLine,putStr,putStrLn
                                 ,zip,zipWith,unzip,notElem)
 
+import Data.ByteString.Base
+import Data.ByteString.Fusion
+
 import qualified Data.List as List
 
-import Data.Char
 import Data.Word                (Word8)
 import Data.Maybe               (listToMaybe)
 import Data.Array               (listArray)
@@ -267,7 +253,7 @@ import Control.Exception        (bracket, assert)
 import Control.Monad            (when)
 
 import Foreign.C.String         (CString, CStringLen)
-import Foreign.C.Types          (CSize,CInt)
+import Foreign.C.Types          (CSize)
 import Foreign.ForeignPtr
 import Foreign.Marshal.Array
 import Foreign.Ptr
@@ -286,8 +272,6 @@ import System.IO.Unsafe
 
 #if defined(__GLASGOW_HASKELL__)
 
-import Data.Generics            (Data(..), Typeable(..))
-
 import System.IO                (hGetBufNonBlocking)
 import System.IO.Error          (isEOFError)
 
@@ -295,8 +279,8 @@ import Foreign.Marshal          (alloca)
 import qualified Foreign.Concurrent as FC (newForeignPtr)
 
 import GHC.Handle
-import GHC.Prim                 (realWorld#, Addr#, Word#, (+#), writeWord8OffAddr#)
-import GHC.Base                 (build, unsafeChr)
+import GHC.Prim                 (Word#, (+#), writeWord8OffAddr#)
+import GHC.Base                 (build)
 import GHC.Word hiding (Word8)
 import GHC.Ptr                  (Ptr(..))
 import GHC.ST                   (ST(..))
@@ -304,9 +288,6 @@ import GHC.IOBase
 
 #endif
 
--- CFILES stuff is Hugs only
-{-# CFILES cbits/fpstring.c #-}
-
 -- -----------------------------------------------------------------------------
 --
 -- Useful macros, until we have bang patterns
@@ -320,19 +301,6 @@ import GHC.IOBase
 
 -- -----------------------------------------------------------------------------
 
--- | A space-efficient representation of a Word8 vector, supporting many
--- efficient operations.  A 'ByteString' contains 8-bit characters only.
---
--- Instances of Eq, Ord, Read, Show, Data, Typeable
---
-data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8)
-                     {-# UNPACK #-} !Int
-                     {-# UNPACK #-} !Int
-
-#if defined(__GLASGOW_HASKELL__)
-    deriving (Data, Typeable)
-#endif
-
 instance Eq  ByteString
     where (==)    = eq
 
@@ -373,7 +341,7 @@ compareBytes (PS x1 s1 l1) (PS x2 s2 l2)
         withForeignPtr x1 $ \p1 ->
         withForeignPtr x2 $ \p2 -> do
             i <- memcmp (p1 `plusPtr` s1) (p2 `plusPtr` s2) (fromIntegral $ min l1 l2)
-            return $ case i `compare` 0 of
+            return $! case i `compare` 0 of
                         EQ  -> l1 `compare` l2
                         x   -> x
 {-# INLINE compareBytes #-}
@@ -409,14 +377,12 @@ cmp p1 p2 n len1 len2
 
 -- | /O(1)/ The empty 'ByteString'
 empty :: ByteString
-empty = inlinePerformIO $ mallocByteString 1 >>= \fp -> return $ PS fp 0 0
+empty = unsafeCreate 0 $ const $ return ()
 {-# NOINLINE empty #-}
 
 -- | /O(1)/ Convert a 'Word8' into a 'ByteString'
 singleton :: Word8 -> ByteString
-singleton c = unsafePerformIO $ mallocByteString 2 >>= \fp -> do
-    withForeignPtr fp $ \p -> poke p c
-    return $ PS fp 0 1
+singleton c = unsafeCreate 1 $ \p -> poke p c
 {-# INLINE singleton #-}
 
 --
@@ -446,14 +412,14 @@ pack :: [Word8] -> ByteString
 
 #if !defined(__GLASGOW_HASKELL__)
 
-pack str = create (P.length str) $ \p -> go p str
+pack str = unsafeCreate (P.length str) $ \p -> go p str
     where
         go _ []     = return ()
         go p (x:xs) = poke p x >> go (p `plusPtr` 1) xs -- less space than pokeElemOff
 
 #else /* hack away */
 
-pack str = create (P.length str) $ \(Ptr p) -> stToIO (go p 0# str)
+pack str = unsafeCreate (P.length str) $ \(Ptr p) -> stToIO (go p 0# str)
     where
         go _ _ []        = return ()
         go p i (W8# c:cs) = writeByte p i c >> go p (i +# 1#) cs
@@ -506,6 +472,12 @@ unpackFoldr (PS fp off len) f ch = withPtr fp $ \p -> do
 {-# INLINE [0] unpackFoldr #-}
 
 -- TODO just use normal foldr here.
+--
+-- or
+-- unpack xs | null xs = []
+--           | otherwise = unsafeHead xs : unpack (unsafeTail xs)
+--
+-- ?
 
 #endif
 
@@ -514,7 +486,7 @@ unpackFoldr (PS fp off len) f ch = withPtr fp $ \p -> do
 -- | /O(n)/ Convert a '[a]' into a 'ByteString' using some
 -- conversion function
 packWith :: (a -> Word8) -> [a] -> ByteString
-packWith k str = create (P.length str) $ \p -> go p str
+packWith k str = unsafeCreate (P.length str) $ \p -> go p str
     where
         STRICT2(go)
         go _ []     = return ()
@@ -539,40 +511,50 @@ unpackWith k (PS ps s l) = inlinePerformIO $ withForeignPtr ps $ \p ->
 
 -- | /O(1)/ Test whether a ByteString is empty.
 null :: ByteString -> Bool
-null (PS _ _ l) = l == 0
+null (PS _ _ l) = assert (l >= 0) $ l <= 0
 {-# INLINE null #-}
 
+-- ---------------------------------------------------------------------
 -- | /O(1)/ 'length' returns the length of a ByteString as an 'Int'.
 length :: ByteString -> Int
-length (PS _ _ l) = l
+length (PS _ _ l) = assert (l >= 0) $ l
+
+--
+-- length/loop fusion. When taking the length of any fuseable loop,
+-- rewrite it as a foldl', and thus avoid allocating the result buffer
+-- worth around 10% in speed testing.
+--
 
 #if defined(__GLASGOW_HASKELL__)
 {-# INLINE [1] length #-}
 #endif
 
-{-# 
+lengthU :: ByteString -> Int
+lengthU = foldl' (const . (+1)) (0::Int)
+{-# INLINE lengthU #-}
 
--- Translate length into a loop. 
--- Performace ok, but allocates too much, so disable for now.
+{-# RULES
 
-  "length/loop" forall f acc s .
-  length (loopArr (loopU f acc s)) = foldl' (const . (+1)) (0::Int) (loopArr (loopU f acc s))
+-- v2 fusion
+"length/loop" forall loop s .
+  length  (loopArr (loopWrapper loop s)) =
+  lengthU (loopArr (loopWrapper loop s))
 
   #-}
 
+------------------------------------------------------------------------
+
 -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different
 -- complexity, as it requires a memcpy.
 cons :: Word8 -> ByteString -> ByteString
-cons c (PS x s l) = create (l+1) $ \p -> withForeignPtr x $ \f -> do
+cons c (PS x s l) = unsafeCreate (l+1) $ \p -> withForeignPtr x $ \f -> do
         poke p c
         memcpy (p `plusPtr` 1) (f `plusPtr` s) (fromIntegral l)
 {-# INLINE cons #-}
 
--- todo fuse
-
 -- | /O(n)/ Append a byte to the end of a 'ByteString'
 snoc :: ByteString -> Word8 -> ByteString
-snoc (PS x s l) c = create (l+1) $ \p -> withForeignPtr x $ \f -> do
+snoc (PS x s l) c = unsafeCreate (l+1) $ \p -> withForeignPtr x $ \f -> do
         memcpy p (f `plusPtr` s) (fromIntegral l)
         poke (p `plusPtr` l) c
 {-# INLINE snoc #-}
@@ -580,13 +562,15 @@ snoc (PS x s l) c = create (l+1) $ \p -> withForeignPtr x $ \f -> do
 -- todo fuse
 
 -- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
+-- An exception will be thrown in the case of an empty ByteString.
 head :: ByteString -> Word8
-head ps@(PS x s _)
-    | null ps   = errorEmptyList "head"
+head (PS x s l)
+    | l <= 0    = errorEmptyList "head"
     | otherwise = inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p s
 {-# INLINE head #-}
 
 -- | /O(1)/ Extract the elements after the head of a ByteString, which must be non-empty.
+-- An exception will be thrown in the case of an empty ByteString.
 tail :: ByteString -> ByteString
 tail (PS p s l)
     | l <= 0    = errorEmptyList "tail"
@@ -594,6 +578,7 @@ tail (PS p s l)
 {-# INLINE tail #-}
 
 -- | /O(1)/ Extract the last element of a ByteString, which must be finite and non-empty.
+-- An exception will be thrown in the case of an empty ByteString.
 last :: ByteString -> Word8
 last ps@(PS x s l)
     | null ps   = errorEmptyList "last"
@@ -601,9 +586,10 @@ last ps@(PS x s l)
 {-# INLINE last #-}
 
 -- | /O(1)/ Return all the elements of a 'ByteString' except the last one.
+-- An exception will be thrown in the case of an empty ByteString.
 init :: ByteString -> ByteString
-init (PS p s l)
-    | l <= 0    = errorEmptyList "init"
+init ps@(PS p s l)
+    | null ps   = errorEmptyList "init"
     | otherwise = PS p s (l-1)
 {-# INLINE init #-}
 
@@ -620,19 +606,23 @@ append xs ys | null xs   = ys
 -- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each
 -- element of @xs@. This function is subject to array fusion.
 map :: (Word8 -> Word8) -> ByteString -> ByteString
-map f = loopArr . loopU (mapEFL f) noAL
+#if defined(LOOPU_FUSION)
+map f = loopArr . loopU (mapEFL f) NoAcc
+#elif defined(LOOPUP_FUSION)
+map f = loopArr . loopUp (mapEFL f) NoAcc
+#elif defined(LOOPNOACC_FUSION)
+map f = loopArr . loopNoAcc (mapEFL f)
+#else
+map f = loopArr . loopMap f
+#endif
 {-# INLINE map #-}
 
 -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
 -- slightly faster for one-shot cases.
 map' :: (Word8 -> Word8) -> ByteString -> ByteString
-map' f (PS fp s len) = inlinePerformIO $ withForeignPtr fp $ \a -> do
-    np <- mallocByteString (len+1)
-    withForeignPtr np $ \p -> do
-        map_ 0 (a `plusPtr` s) p
-        return (PS np 0 len)
+map' f (PS fp s len) = inlinePerformIO $ withForeignPtr fp $ \a ->
+    create len $ map_ 0 (a `plusPtr` s)
   where
-
     map_ :: Int -> Ptr Word8 -> Ptr Word8 -> IO ()
     STRICT3(map_)
     map_ n p1 p2
@@ -645,12 +635,10 @@ map' f (PS fp s len) = inlinePerformIO $ withForeignPtr fp $ \a -> do
 
 -- | /O(n)/ 'reverse' @xs@ efficiently returns the elements of @xs@ in reverse order.
 reverse :: ByteString -> ByteString
-reverse (PS x s l) = create l $ \p -> withForeignPtr x $ \f ->
+reverse (PS x s l) = unsafeCreate l $ \p -> withForeignPtr x $ \f ->
         c_reverse p (f `plusPtr` s) (fromIntegral l)
 
-{-
-reverse = pack . P.reverse . unpack
--}
+-- todo, fuseable version
 
 -- | /O(n)/ The 'intersperse' function takes a 'Word8' and a
 -- 'ByteString' and \`intersperses\' that byte between the elements of
@@ -659,7 +647,7 @@ reverse = pack . P.reverse . unpack
 intersperse :: Word8 -> ByteString -> ByteString
 intersperse c ps@(PS x s l)
     | length ps < 2  = ps
-    | otherwise      = create (2*l-1) $ \p -> withForeignPtr x $ \f ->
+    | otherwise      = unsafeCreate (2*l-1) $ \p -> withForeignPtr x $ \f ->
         c_intersperse p (f `plusPtr` s) (fromIntegral l) c
 
 {-
@@ -679,7 +667,11 @@ transpose ps = P.map pack (List.transpose (P.map unpack ps))
 -- ByteString using the binary operator, from left to right.
 -- This function is subject to array fusion.
 foldl :: (a -> Word8 -> a) -> a -> ByteString -> a
+#if !defined(LOOPU_FUSION)
+foldl f z = loopAcc . loopUp (foldEFL f) z
+#else
 foldl f z = loopAcc . loopU (foldEFL f) z
+#endif
 {-# INLINE foldl #-}
 
 {-
@@ -697,43 +689,45 @@ foldl f v (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
 -}
 
 -- | 'foldl\'' is like 'foldl', but strict in the accumulator.
+-- Though actually foldl is also strict in the accumulator.
 foldl' :: (a -> Word8 -> a) -> a -> ByteString -> a
-foldl' f z = loopAcc . loopU (foldEFL' f) z
+foldl' = foldl
+-- foldl' f z = loopAcc . loopU (foldEFL' f) z
 {-# INLINE foldl' #-}
 
 -- | 'foldr', applied to a binary operator, a starting value
 -- (typically the right-identity of the operator), and a ByteString,
 -- reduces the ByteString using the binary operator, from right to left.
 foldr :: (Word8 -> a -> a) -> a -> ByteString -> a
-foldr k z (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
-        go (ptr `plusPtr` s) (ptr `plusPtr` (s+l))
-    where
-        STRICT2(go)
-        go p q | p == q    = return z
-               | otherwise = do c  <- peek p
-                                ws <- go (p `plusPtr` 1) q
-                                return $ c `k` ws
+foldr k z = loopAcc . loopDown (foldEFL (flip k)) z
+{-# INLINE foldr #-}
 
 -- | 'foldl1' is a variant of 'foldl' that has no starting value
 -- argument, and thus must be applied to non-empty 'ByteStrings'.
--- This function is subject to array fusion.
+-- This function is subject to array fusion. 
+-- An exception will be thrown in the case of an empty ByteString.
 foldl1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
 foldl1 f ps
     | null ps   = errorEmptyList "foldl1"
     | otherwise = foldl f (unsafeHead ps) (unsafeTail ps)
+{-# INLINE foldl1 #-}
 
 -- | 'foldl1\'' is like 'foldl1', but strict in the accumulator.
+-- An exception will be thrown in the case of an empty ByteString.
 foldl1' :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
 foldl1' f ps
     | null ps   = errorEmptyList "foldl1'"
     | otherwise = foldl' f (unsafeHead ps) (unsafeTail ps)
+{-# INLINE foldl1' #-}
 
 -- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
 -- and thus must be applied to non-empty 'ByteString's
+-- An exception will be thrown in the case of an empty ByteString.
 foldr1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
 foldr1 f ps
     | null ps        = errorEmptyList "foldr1"
     | otherwise      = foldr f (last ps) (init ps)
+{-# INLINE foldr1 #-}
 
 -- ---------------------------------------------------------------------
 -- Special folds
@@ -742,7 +736,7 @@ foldr1 f ps
 concat :: [ByteString] -> ByteString
 concat []     = empty
 concat [ps]   = ps
-concat xs     = create len $ \ptr -> go xs ptr
+concat xs     = unsafeCreate len $ \ptr -> go xs ptr
   where len = P.sum . P.map length $ xs
         STRICT2(go)
         go []            _   = return ()
@@ -752,8 +746,9 @@ concat xs     = create len $ \ptr -> go xs ptr
 
 -- | Map a function over a 'ByteString' and concatenate the results
 concatMap :: (Word8 -> ByteString) -> ByteString -> ByteString
-concatMap f = foldr (append . f) empty
--- A silly function for ByteStrings anyway.
+concatMap f = concat . foldr ((:) . f) []
+
+-- foldr (append . f) empty
 
 -- | /O(n)/ Applied to a predicate and a ByteString, 'any' determines if
 -- any element of the 'ByteString' satisfies the predicate.
@@ -783,62 +778,85 @@ all f (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr ->
                                  if f c
                                     then go (p `plusPtr` 1) q
                                     else return False
--- todo fuse
+
+------------------------------------------------------------------------
 
 -- | /O(n)/ 'maximum' returns the maximum value from a 'ByteString'
+-- This function will fuse.
+-- An exception will be thrown in the case of an empty ByteString.
 maximum :: ByteString -> Word8
 maximum xs@(PS x s l)
     | null xs   = errorEmptyList "maximum"
     | otherwise = inlinePerformIO $ withForeignPtr x $ \p ->
-                    return $ c_maximum (p `plusPtr` s) (fromIntegral l)
-{-# INLINE maximum #-}
+                      c_maximum (p `plusPtr` s) (fromIntegral l)
 
 -- | /O(n)/ 'minimum' returns the minimum value from a 'ByteString'
+-- This function will fuse.
+-- An exception will be thrown in the case of an empty ByteString.
 minimum :: ByteString -> Word8
 minimum xs@(PS x s l)
     | null xs   = errorEmptyList "minimum"
     | otherwise = inlinePerformIO $ withForeignPtr x $ \p ->
-                    return $ c_minimum (p `plusPtr` s) (fromIntegral l)
-{-# INLINE minimum #-}
+                      c_minimum (p `plusPtr` s) (fromIntegral l)
+
+--
+-- minimum/maximum/loop fusion. As for length (and other folds), when we
+-- see we're applied after a fuseable op, switch from using the C
+-- version, to the fuseable version. The result should then avoid
+-- allocating a buffer.
+--
 
--- fusion is too slow here (10x)
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] minimum #-}
+{-# INLINE [1] maximum #-}
+#endif
 
-{-
-maximum xs@(PS x s l)
-    | null xs   = errorEmptyList "maximum"
-    | otherwise = inlinePerformIO $ withForeignPtr x $ \p -> do
-                        w <- peek p
-                        maximum_ (p `plusPtr` s) 0 l w
+maximumU :: ByteString -> Word8
+maximumU = foldl1' max
+{-# INLINE maximumU #-}
 
-maximum_ :: Ptr Word8 -> Int -> Int -> Word8 -> IO Word8
-STRICT4(maximum_)
-maximum_ ptr n m c
-    | n >= m    = return c
-    | otherwise = do w <- peekByteOff ptr n
-                     maximum_ ptr (n+1) m (if w > c then w else c)
+minimumU :: ByteString -> Word8
+minimumU = foldl1' min
+{-# INLINE minimumU #-}
 
-minimum xs@(PS x s l)
-    | null xs   = errorEmptyList "minimum"
-    | otherwise = inlinePerformIO $ withForeignPtr x $ \p -> do
-                        w <- peek p
-                        minimum_ (p `plusPtr` s) 0 l w
-
-minimum_ :: Ptr Word8 -> Int -> Int -> Word8 -> IO Word8
-STRICT4(minimum_)
-minimum_ ptr n m c
-    | n >= m    = return c
-    | otherwise = do w <- peekByteOff ptr n
-                     minimum_ ptr (n+1) m (if w < c then w else c)
--}
+{-# RULES
+
+"minimum/loop" forall loop s .
+  minimum  (loopArr (loopWrapper loop s)) =
+  minimumU (loopArr (loopWrapper loop s))
+
+"maximum/loop" forall loop s .
+  maximum  (loopArr (loopWrapper loop s)) =
+  maximumU (loopArr (loopWrapper loop s))
 
+  #-}
+
+------------------------------------------------------------------------
+
+-- | The 'mapAccumL' function behaves like a combination of 'map' and
+-- 'foldl'; it applies a function to each element of a ByteString,
+-- passing an accumulating parameter from left to right, and returning a
+-- final value of this accumulator together with the new list.
 mapAccumL :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
-mapAccumL f z = loopU (mapAccumEFL f) z
+#if !defined(LOOPU_FUSION)
+mapAccumL f z = unSP . loopUp (mapAccumEFL f) z
+#else
+mapAccumL f z = unSP . loopU (mapAccumEFL f) z
+#endif
+{-# INLINE mapAccumL #-}
 
---mapAccumR :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
+-- | The 'mapAccumR' function behaves like a combination of 'map' and
+-- 'foldr'; it applies a function to each element of a ByteString,
+-- passing an accumulating parameter from right to left, and returning a
+-- final value of this accumulator together with the new ByteString.
+mapAccumR :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
+mapAccumR f z = unSP . loopDown (mapAccumEFL f) z
+{-# INLINE mapAccumR #-}
 
 -- | /O(n)/ map Word8 functions, provided with the index at each position
 mapIndexed :: (Int -> Word8 -> Word8) -> ByteString -> ByteString
-mapIndexed f = loopArr . loopU (mapIndexEFL f) 0
+mapIndexed f = loopArr . loopUp (mapIndexEFL f) 0
+{-# INLINE mapIndexed #-}
 
 -- ---------------------------------------------------------------------
 -- Building ByteStrings
@@ -852,7 +870,15 @@ mapIndexed f = loopArr . loopU (mapIndexEFL f) 0
 --
 -- > last (scanl f z xs) == foldl f z xs.
 scanl :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
-scanl f z ps = loopArr . loopU (scanEFL f) z $ (ps `snoc` 0) -- extra space
+#if !defined(LOOPU_FUSION)
+scanl f z ps = loopArr . loopUp (scanEFL f) z $ (ps `snoc` 0)
+#else
+scanl f z ps = loopArr . loopU (scanEFL f) z $ (ps `snoc` 0)
+#endif
+
+    -- n.b. haskell's List scan returns a list one bigger than the
+    -- input, so we need to snoc here to get some extra space, however,
+    -- it breaks map/up fusion (i.e. scanl . map no longer fuses)
 {-# INLINE scanl #-}
 
 -- | 'scanl1' is a variant of 'scanl' that has no starting value argument.
@@ -865,6 +891,18 @@ scanl1 f ps
     | otherwise = scanl f (unsafeHead ps) (unsafeTail ps)
 {-# INLINE scanl1 #-}
 
+-- | scanr is the right-to-left dual of scanl.
+scanr :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
+scanr f z ps = loopArr . loopDown (scanEFL (flip f)) z $ (0 `cons` ps) -- extra space
+{-# INLINE scanr #-}
+
+-- | 'scanr1' is a variant of 'scanr' that has no starting value argument.
+scanr1 :: (Word8 -> Word8 -> Word8) -> ByteString -> ByteString
+scanr1 f ps
+    | null ps   = empty
+    | otherwise = scanr f (last ps) (init ps) -- todo, unsafe versions
+{-# INLINE scanr1 #-}
+
 -- ---------------------------------------------------------------------
 -- Unfolds and replicates
 
@@ -875,17 +913,10 @@ scanl1 f ps
 --
 -- This implemenation uses @memset(3)@
 replicate :: Int -> Word8 -> ByteString
-replicate w c | w <= 0    = empty
-              | otherwise = create w $ \ptr -> memset ptr c (fromIntegral w) >> return ()
-
-{-
--- About 5x slower
-replicate w c = inlinePerformIO $ generate w $ \ptr -> go ptr w
-    where
-        STRICT2(go)
-        go _   0 = return w
-        go ptr n = poke ptr c >> go (ptr `plusPtr` 1) (n-1)
--}
+replicate w c
+    | w <= 0    = empty
+    | otherwise = unsafeCreate w $ \ptr ->
+                      memset ptr c (fromIntegral w) >> return ()
 
 -- | /O(n)/, where /n/ is the length of the result.  The 'unfoldr' 
 -- function is analogous to the List \'unfoldr\'.  'unfoldr' builds a 
@@ -918,18 +949,15 @@ unfoldr f = concat . unfoldChunk 32 64
 unfoldrN :: Int -> (a -> Maybe (Word8, a)) -> a -> (ByteString, Maybe a)
 unfoldrN i f x0
     | i < 0     = (empty, Just x0)
-    | otherwise = inlinePerformIO $ do
-                    fp <- mallocByteString i
-                    withForeignPtr fp (\p -> go fp p x0 0)
-  where STRICT4(go)
-        go fp p x n =
+    | otherwise = unsafePerformIO $ createAndTrim' i $ \p -> go p x0 0
+  where STRICT3(go)
+        go p x n =
           case f x of
-            Nothing      -> let s = copy (PS fp 0 n)
-                             in s `seq` return (s, Nothing)
+            Nothing      -> return (0, n, Nothing)
             Just (w,x')
-             | n == i    -> return (PS fp 0 i, Just x)
+             | n == i    -> return (0, n, Just x)
              | otherwise -> do poke p w
-                               go fp (p `plusPtr` 1) x' (n+1)
+                               go (p `plusPtr` 1) x' (n+1)
 
 -- ---------------------------------------------------------------------
 -- Substrings
@@ -977,6 +1005,12 @@ break :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
 break p ps = case findIndexOrEnd p ps of n -> (unsafeTake n ps, unsafeDrop n ps)
 {-# INLINE break #-}
 
+-- | 'breakEnd' behaves like 'break' but from the end of the 'ByteString'
+-- 
+-- breakEnd p == spanEnd (not.p)
+breakEnd :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+breakEnd  p ps = splitAt (findFromEndUntil p ps) ps
+
 -- | 'breakByte' breaks its ByteString argument at the first occurence
 -- of the specified byte. It is more efficient than 'break' as it is
 -- implemented with @memchr(3)@. I.e.
@@ -1095,9 +1129,10 @@ split w (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
     let ptr = p `plusPtr` s
 
         STRICT1(loop)
-        loop n = do
-            let q = memchr (ptr `plusPtr` n) w (fromIntegral (l-n))
-            if q == nullPtr
+        loop n =
+            let q = inlinePerformIO $ memchr (ptr `plusPtr` n)
+                                           w (fromIntegral (l-n))
+            in if q == nullPtr
                 then [PS x (s+n) (l-n)]
                 else let i = q `minusPtr` ptr in PS x (s+n) (i-n) : loop (i+1)
 
@@ -1175,7 +1210,7 @@ join s = concat . (List.intersperse s)
 -- with a char. Around 4 times faster than the generalised join.
 --
 joinWithByte :: Word8 -> ByteString -> ByteString -> ByteString
-joinWithByte c f@(PS ffp s l) g@(PS fgp t m) = create len $ \ptr ->
+joinWithByte c f@(PS ffp s l) g@(PS fgp t m) = unsafeCreate len $ \ptr ->
     withForeignPtr ffp $ \fp ->
     withForeignPtr fgp $ \gp -> do
         memcpy ptr (fp `plusPtr` s) (fromIntegral l)
@@ -1204,8 +1239,8 @@ index ps n
 elemIndex :: Word8 -> ByteString -> Maybe Int
 elemIndex c (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
     let p' = p `plusPtr` s
-        q  = memchr p' c (fromIntegral l)
-    return $ if q == nullPtr then Nothing else Just $! q `minusPtr` p'
+    q <- memchr p' c (fromIntegral l)
+    return $! if q == nullPtr then Nothing else Just $! q `minusPtr` p'
 {-# INLINE elemIndex #-}
 
 -- | /O(n)/ The 'elemIndexEnd' function returns the last index of the
@@ -1236,12 +1271,13 @@ elemIndices w (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
     let ptr = p `plusPtr` s
 
         STRICT1(loop)
-        loop n = let q = memchr (ptr `plusPtr` n) w (fromIntegral (l - n))
+        loop n = let q = inlinePerformIO $ memchr (ptr `plusPtr` n)
+                                                w (fromIntegral (l - n))
                  in if q == nullPtr
                         then []
                         else let i = q `minusPtr` ptr
                              in i : loop (i+1)
-    return (loop 0)
+    return $! loop 0
 {-# INLINE elemIndices #-}
 
 {-
@@ -1261,7 +1297,7 @@ elemIndices c ps = loop 0 ps
 -- But more efficiently than using length on the intermediate list.
 count :: Word8 -> ByteString -> Int
 count w (PS x s m) = inlinePerformIO $ withForeignPtr x $ \p ->
-    return $ fromIntegral $ c_count (p `plusPtr` s) (fromIntegral m) w
+    fmap fromIntegral $ c_count (p `plusPtr` s) (fromIntegral m) w
 {-# INLINE count #-}
 
 {-
@@ -1274,7 +1310,7 @@ count w (PS x s m) = inlinePerformIO $ withForeignPtr x $ \p ->
         go :: Ptr Word8 -> CSize -> Int -> IO Int
         STRICT3(go)
         go p l i = do
-            let q = memchr p w l
+            q <- memchr p w l
             if q == nullPtr
                 then return i
                 else do let k = fromIntegral $ q `minusPtr` p
@@ -1335,7 +1371,15 @@ notElem c ps = not (elem c ps)
 -- returns a ByteString containing those characters that satisfy the
 -- predicate. This function is subject to array fusion.
 filter :: (Word8 -> Bool) -> ByteString -> ByteString
-filter p  = loopArr . loopU (filterEFL p) noAL
+#if defined(LOOPU_FUSION)
+filter p  = loopArr . loopU (filterEFL p) NoAcc
+#elif defined(LOOPUP_FUSION)
+filter p  = loopArr . loopUp (filterEFL p) NoAcc
+#elif defined(LOOPNOACC_FUSION)
+filter p  = loopArr . loopNoAcc (filterEFL p)
+#else
+filter f = loopArr . loopFilter f
+#endif
 {-# INLINE filter #-}
 
 -- | /O(n)/ 'filter\'' is a non-fuseable version of filter, that may be
@@ -1343,9 +1387,9 @@ filter p  = loopArr . loopU (filterEFL p) noAL
 filter' :: (Word8 -> Bool) -> ByteString -> ByteString
 filter' k ps@(PS x s l)
     | null ps   = ps
-    | otherwise = inlinePerformIO $ generate l $ \p -> withForeignPtr x $ \f -> do
+    | otherwise = unsafePerformIO $ createAndTrim l $ \p -> withForeignPtr x $ \f -> do
         t <- go (f `plusPtr` s) p (f `plusPtr` (s + l))
-        return (t `minusPtr` p) -- actual length
+        return $! t `minusPtr` p -- actual length
     where
         STRICT3(go)
         go f t end | f == end  = return t
@@ -1415,7 +1459,7 @@ isPrefixOf (PS x1 s1 l1) (PS x2 s2 l2)
     | otherwise = inlinePerformIO $ withForeignPtr x1 $ \p1 ->
         withForeignPtr x2 $ \p2 -> do
             i <- memcmp (p1 `plusPtr` s1) (p2 `plusPtr` s2) (fromIntegral l1)
-            return (i == 0)
+            return $! i == 0
 
 -- | /O(n)/ The 'isSuffixOf' function takes two ByteStrings and returns 'True'
 -- iff the first is a suffix of the second.
@@ -1433,7 +1477,7 @@ isSuffixOf (PS x1 s1 l1) (PS x2 s2 l2)
     | otherwise = inlinePerformIO $ withForeignPtr x1 $ \p1 ->
         withForeignPtr x2 $ \p2 -> do
             i <- memcmp (p1 `plusPtr` s1) (p2 `plusPtr` s2 `plusPtr` (l2 - l1)) (fromIntegral l1)
-            return (i == 0)
+            return $! i == 0
 
 -- | Check whether one string is a substring of another. @isSubstringOf
 -- p s@ is equivalent to @not (null (findSubstrings p s))@.
@@ -1491,12 +1535,44 @@ zip ps qs
 -- | 'zipWith' generalises 'zip' by zipping with the function given as
 -- the first argument, instead of a tupling function.  For example,
 -- @'zipWith' (+)@ is applied to two ByteStrings to produce the list of
--- corresponding sums.
+-- corresponding sums. 
 zipWith :: (Word8 -> Word8 -> a) -> ByteString -> ByteString -> [a]
 zipWith f ps qs
     | null ps || null qs = []
     | otherwise = f (unsafeHead ps) (unsafeHead qs) : zipWith f (unsafeTail ps) (unsafeTail qs)
 
+--
+-- | A specialised version of zipWith for the common case of a
+-- simultaneous map over two bytestrings, to build a 3rd. Rewrite rules
+-- are used to automatically covert zipWith into zipWith' when a pack is
+-- performed on the result of zipWith, but we also export it for
+-- convenience.
+--
+zipWith' :: (Word8 -> Word8 -> Word8) -> ByteString -> ByteString -> ByteString
+zipWith' f (PS fp s l) (PS fq t m) = inlinePerformIO $
+    withForeignPtr fp $ \a ->
+    withForeignPtr fq $ \b ->
+    create len $ zipWith_ 0 (a `plusPtr` s) (b `plusPtr` t)
+  where
+    zipWith_ :: Int -> Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO ()
+    STRICT4(zipWith_)
+    zipWith_ n p1 p2 r
+       | n >= len = return ()
+       | otherwise = do
+            x <- peekByteOff p1 n
+            y <- peekByteOff p2 n
+            pokeByteOff r n (f x y)
+            zipWith_ (n+1) p1 p2 r
+
+    len = min l m
+{-# INLINE zipWith' #-}
+
+{-# RULES
+
+"Specialise zipWith" forall (f :: Word8 -> Word8 -> Word8) p q .
+    pack (zipWith f p q) = zipWith' f p q
+  #-}
+
 -- | /O(n)/ 'unzip' transforms a list of pairs of bytes into a pair of
 -- ByteStrings. Note that this performs two 'pack' operations.
 unzip :: [(Word8,Word8)] -> (ByteString,ByteString)
@@ -1522,10 +1598,10 @@ tails p | null p    = [empty]
 
 -- | /O(n)/ Sort a ByteString efficiently, using counting sort.
 sort :: ByteString -> ByteString
-sort (PS input s l) = create l $ \p -> allocaArray 256 $ \arr -> do
+sort (PS input s l) = unsafeCreate l $ \p -> allocaArray 256 $ \arr -> do
 
     memset (castPtr arr) 0 (256 * fromIntegral (sizeOf (undefined :: CSize)))
-    withForeignPtr input (\x -> countEach arr (x `plusPtr` s) l)
+    withForeignPtr input (\x -> countOccurrences arr (x `plusPtr` s) l)
 
     let STRICT2(go)
         go 256 _   = return ()
@@ -1534,30 +1610,13 @@ sort (PS input s l) = create l $ \p -> allocaArray 256 $ \arr -> do
                         go (i + 1) (ptr `plusPtr` (fromIntegral n))
     go 0 p
 
--- "countEach counts str l" counts the number of occurences of each Word8 in
--- str, and stores the result in counts.
-countEach :: Ptr CSize -> Ptr Word8 -> Int -> IO ()
-STRICT3(countEach)
-countEach counts str l = go 0
- where
-    STRICT1(go)
-    go i | i == l    = return ()
-         | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
-                          x <- peekElemOff counts k
-                          pokeElemOff counts k (x + 1)
-                          go (i + 1)
-
 {-
 sort :: ByteString -> ByteString
-sort (PS x s l) = create l $ \p -> withForeignPtr x $ \f -> do
+sort (PS x s l) = unsafeCreate l $ \p -> withForeignPtr x $ \f -> do
         memcpy p (f `plusPtr` s) l
         c_qsort p l -- inplace
 -}
 
-{-
-sort = pack . List.sort . unpack
--}
-
 -- | The 'sortBy' function is the non-overloaded version of 'sort'.
 --
 -- Try some linear sorts: radix, counting
@@ -1567,167 +1626,51 @@ sort = pack . List.sort . unpack
 -- sortBy f ps = undefined
 
 -- ---------------------------------------------------------------------
---
--- Extensions to the basic interface
---
-
--- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits the
--- check for the empty case, so there is an obligation on the programmer
--- to provide a proof that the ByteString is non-empty.
-unsafeHead :: ByteString -> Word8
-unsafeHead (PS x s _) = inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p s
-{-# INLINE unsafeHead #-}
-
--- | A variety of 'tail' for non-empty ByteStrings. 'unsafeTail' omits the
--- check for the empty case. As with 'unsafeHead', the programmer must
--- provide a separate proof that the ByteString is non-empty.
-unsafeTail :: ByteString -> ByteString
-unsafeTail (PS ps s l) = PS ps (s+1) (l-1)
-{-# INLINE unsafeTail #-}
-
--- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a 'Word8'
--- This omits the bounds check, which means there is an accompanying
--- obligation on the programmer to ensure the bounds are checked in some
--- other way.
-unsafeIndex :: ByteString -> Int -> Word8
-unsafeIndex (PS x s _) i = inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p (s+i)
-{-# INLINE unsafeIndex #-}
-
--- | A variety of 'take' which omits the checks on @n@ so there is an
--- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
-unsafeTake :: Int -> ByteString -> ByteString
-unsafeTake n (PS x s l) =
-  assert (0 <= n && n <= l) $ PS x s n
-{-# INLINE unsafeTake #-}
-
--- | A variety of 'drop' which omits the checks on @n@ so there is an
--- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
-unsafeDrop  :: Int -> ByteString -> ByteString
-unsafeDrop n (PS x s l) =
-  assert (0 <= n && n <= l) $ PS x (s+n) (l-n)
-{-# INLINE unsafeDrop #-}
-
--- ---------------------------------------------------------------------
 -- Low level constructors
 
-#if defined(__GLASGOW_HASKELL__)
--- | /O(n)/ Pack a null-terminated sequence of bytes, pointed to by an
--- Addr\# (an arbitrary machine address assumed to point outside the
--- garbage-collected heap) into a @ByteString@. A much faster way to
--- create an Addr\# is with an unboxed string literal, than to pack a
--- boxed string. A unboxed string literal is compiled to a static @char
--- []@ by GHC. Establishing the length of the string requires a call to
--- @strlen(3)@, so the Addr# must point to a null-terminated buffer (as
--- is the case with "string"# literals in GHC). Use 'unsafePackAddress'
--- if you know the length of the string statically.
---
--- An example:
---
--- > literalFS = packAddress "literal"#
---
-packAddress :: Addr# -> ByteString
-packAddress addr# = inlinePerformIO $ do
-    p <- newForeignPtr_ cstr
-    return $ PS p 0 (fromIntegral $ c_strlen cstr)
-  where
-    cstr = Ptr addr#
-{-# INLINE packAddress #-}
-
--- | /O(1)/ 'unsafePackAddress' provides constant-time construction of
--- 'ByteStrings' -- which is ideal for string literals. It packs a
--- null-terminated sequence of bytes into a 'ByteString', given a raw
--- 'Addr\#' to the string, and the length of the string. Make sure the
--- length is correct, otherwise use the safer 'packAddress' (where the
--- length will be calculated once at runtime).
-unsafePackAddress :: Int -> Addr# -> ByteString
-unsafePackAddress len addr# = inlinePerformIO $ do
-    p <- newForeignPtr_ cstr
-    return $ PS p 0 len
-    where cstr = Ptr addr#
-
-#endif
-
--- | /O(1)/ Build a ByteString from a ForeignPtr
-fromForeignPtr :: ForeignPtr Word8 -> Int -> ByteString
-fromForeignPtr fp l = PS fp 0 l
-
--- | /O(1)/ Deconstruct a ForeignPtr from a ByteString
-toForeignPtr :: ByteString -> (ForeignPtr Word8, Int, Int)
-toForeignPtr (PS ps s l) = (ps, s, l)
-
--- | /O(1)/ 'skipIndex' returns the internal skipped index of the
--- current 'ByteString' from any larger string it was created from, as
--- an 'Int'.
-skipIndex :: ByteString -> Int
-skipIndex (PS _ s _) = s
-{-# INLINE skipIndex #-}
-
 -- | /O(n)/ Build a @ByteString@ from a @CString@. This value will have /no/
 -- finalizer associated to it. The ByteString length is calculated using
 -- /strlen(3)/, and thus the complexity is a /O(n)/.
 packCString :: CString -> ByteString
-packCString cstr = inlinePerformIO $ do
+packCString cstr = unsafePerformIO $ do
     fp <- newForeignPtr_ (castPtr cstr)
-    return $ PS fp 0 (fromIntegral $ c_strlen cstr)
+    l <- c_strlen cstr
+    return $! PS fp 0 (fromIntegral l)
 
 -- | /O(1)/ Build a @ByteString@ from a @CStringLen@. This value will
 -- have /no/ finalizer associated with it. This operation has /O(1)/
 -- complexity as we already know the final size, so no /strlen(3)/ is
 -- required.
 packCStringLen :: CStringLen -> ByteString
-packCStringLen (ptr,len) = inlinePerformIO $ do
+packCStringLen (ptr,len) = unsafePerformIO $ do
     fp <- newForeignPtr_ (castPtr ptr)
-    return $ PS fp 0 (fromIntegral len)
+    return $! PS fp 0 (fromIntegral len)
 
 -- | /O(n)/ Build a @ByteString@ from a malloced @CString@. This value will
 -- have a @free(3)@ finalizer associated to it.
 packMallocCString :: CString -> ByteString
-packMallocCString cstr = inlinePerformIO $ do
+packMallocCString cstr = unsafePerformIO $ do
     fp <- newForeignFreePtr (castPtr cstr)
-    return $ PS fp 0 (fromIntegral $ c_strlen cstr)
-
-#if defined(__GLASGOW_HASKELL__)
--- | /O(1)/ Construct a 'ByteString' given a C Ptr Word8 buffer, a
--- length, and an IO action representing a finalizer. This function is
--- not available on Hugs.
---
-packCStringFinalizer :: Ptr Word8 -> Int -> IO () -> IO ByteString
-packCStringFinalizer p l f = do
-    fp <- FC.newForeignPtr p f
-    return $ PS fp 0 l
-
--- | Explicitly run the finaliser associated with a 'ByteString'.
--- Further references to this value may generate invalid memory
--- references. This operation is unsafe, as there may be other
--- 'ByteStrings' referring to the same underlying pages. If you use
--- this, you need to have a proof of some kind that all 'ByteString's
--- ever generated from the underlying byte array are no longer live.
-unsafeFinalize :: ByteString -> IO ()
-unsafeFinalize (PS p _ _) = finalizeForeignPtr p
-
-#endif
+    len <- c_strlen cstr
+    return $! PS fp 0 (fromIntegral len)
 
--- | /O(n) construction/ Use a @ByteString@ with a function requiring a null-terminated @CString@.
---   The @CString@ should not be freed afterwards. This is a memcpy(3).
+-- | /O(n) construction/ Use a @ByteString@ with a function requiring a
+-- null-terminated @CString@.  The @CString@ will be freed
+-- automatically. This is a memcpy(3).
 useAsCString :: ByteString -> (CString -> IO a) -> IO a
-useAsCString (PS ps s l) = bracket alloc (c_free.castPtr)
+useAsCString ps f = useAsCStringLen ps (\(s,_) -> f s)
+
+-- | /O(n) construction/ Use a @ByteString@ with a function requiring a
+-- @CStringLen@. The @CStringLen@ will be freed automatically. This is a
+-- memcpy(3).
+useAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a
+useAsCStringLen (PS ps s l) = bracket alloc (c_free.castPtr.fst)
     where
       alloc = withForeignPtr ps $ \p -> do
                 buf <- c_malloc (fromIntegral l+1)
                 memcpy (castPtr buf) (castPtr p `plusPtr` s) (fromIntegral l)
-                poke (buf `plusPtr` l) (0::Word8)
-                return $ castPtr buf
-
--- | /O(1) construction/ Use a @ByteString@ with a function requiring a @CString@.
--- Warning: modifying the @CString@ will affect the @ByteString@.
--- Why is this function unsafe? It relies on the null byte at the end of
--- the ByteString to be there. This is /not/ the case if your ByteString
--- has been spliced from a larger string (i.e. with take or drop).
--- Unless you can guarantee the null byte, you should use the safe
--- version, which will copy the string first.
---
-unsafeUseAsCString :: ByteString -> (CString -> IO a) -> IO a
-unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s)
+                poke (buf `plusPtr` l) (0::Word8) -- n.b.
+                return $! (castPtr buf, l)
 
 -- | /O(n)/ Make a copy of the 'ByteString' with its own storage. 
 --   This is mainly useful to allow the rest of the data pointed
@@ -1735,64 +1678,20 @@ unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plu
 --   if a large string has been read in, and only a small part of it 
 --   is needed in the rest of the program.
 copy :: ByteString -> ByteString
-copy (PS x s l) = create l $ \p -> withForeignPtr x $ \f ->
+copy (PS x s l) = unsafeCreate l $ \p -> withForeignPtr x $ \f ->
     memcpy p (f `plusPtr` s) (fromIntegral l)
 
 -- | /O(n)/ Duplicate a CString as a ByteString. Useful if you know the
 -- CString is going to be deallocated from C land.
 copyCString :: CString -> IO ByteString
-copyCString cstr = copyCStringLen (cstr, (fromIntegral $ c_strlen cstr))
+copyCString cstr = do
+    len <- c_strlen cstr
+    copyCStringLen (cstr, fromIntegral len)
 
 -- | /O(n)/ Same as copyCString, but saves a strlen call when the length is known.
 copyCStringLen :: CStringLen -> IO ByteString
-copyCStringLen (cstr, len) = do
-    fp <- mallocForeignPtrArray (len+1)
-    withForeignPtr fp $ \p -> do
-        memcpy p (castPtr cstr) (fromIntegral len)
-        poke (p `plusPtr` len) (0 :: Word8)
-    return $! PS fp 0 len
-
--- | /O(1) construction/ Use a @ByteString@ with a function requiring a @CStringLen@.
--- Warning: modifying the @CStringLen@ will affect the @ByteString@.
--- This is analogous to unsafeUseAsCString, and comes with the same
--- safety requirements.
---
-unsafeUseAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a
-unsafeUseAsCStringLen (PS ps s l) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s,l)
-
--- | Given the maximum size needed and a function to make the contents
--- of a ByteString, generate makes the 'ByteString'. The generating
--- function is required to return the actual final size (<= the maximum
--- size), and the resulting byte array is realloced to this size.  The
--- string is padded at the end with a null byte.
---
--- generate is the main mechanism for creating custom, efficient
--- ByteString functions, using Haskell or C functions to fill the space.
---
-generate :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
-generate i f = do
-    fp      <- mallocByteString i
-    (ptr,n) <- withForeignPtr fp $ \p -> do
-        i' <- f p
-        if i' == i
-            then return (fp,i')
-            else do fp_ <- mallocByteString i'      -- realloc
-                    withForeignPtr fp_ $ \p' -> memcpy p' p (fromIntegral i')
-                    return (fp_,i')
-    return (PS ptr 0 n)
-
-{-
---
--- On the C malloc heap. Less fun.
---
-generate i f = do
-    p <- mallocArray (i+1)
-    i' <- f p
-    p' <- reallocArray p (i'+1)
-    poke (p' `plusPtr` i') (0::Word8)    -- XXX so CStrings work
-    fp <- newForeignFreePtr p'
-    return $ PS fp 0 i'
--}
+copyCStringLen (cstr, len) = create len $ \p ->
+    memcpy p (castPtr cstr) (fromIntegral len)
 
 -- ---------------------------------------------------------------------
 -- line IO
@@ -1873,12 +1772,11 @@ hGetLine h = wantReadableHandle "Data.ByteString.hGetLine" h $ \ handle_ -> do
 
 -- TODO, rewrite to use normal memcpy
 mkPS :: RawBuffer -> Int -> Int -> IO ByteString
-mkPS buf start end = do
+mkPS buf start end =
     let len = end - start
-    fp <- mallocByteString len
-    withForeignPtr fp $ \p -> do
+    in create len $ \p -> do
         memcpy_ptr_baoff p buf (fromIntegral start) (fromIntegral len)
-        return (PS fp 0 len)
+        return ()
 
 mkBigPS :: Int -> [ByteString] -> IO ByteString
 mkBigPS _ [ps] = return ps
@@ -1891,27 +1789,33 @@ mkBigPS _ pss = return $! concat (P.reverse pss)
 
 -- | Outputs a 'ByteString' to the specified 'Handle'.
 hPut :: Handle -> ByteString -> IO ()
-hPut _ (PS _ _ 0)  = return ()
-hPut h (PS ps 0 l) = withForeignPtr ps $ \p-> hPutBuf h p l
+hPut _ (PS _  _ 0) = return ()
 hPut h (PS ps s l) = withForeignPtr ps $ \p-> hPutBuf h (p `plusPtr` s) l
 
+-- | A synonym for @hPut@, for compatibility 
+hPutStr :: Handle -> ByteString -> IO ()
+hPutStr = hPut
+
+-- | Write a ByteString to a handle, appending a newline byte
+hPutStrLn :: Handle -> ByteString -> IO ()
+hPutStrLn h ps
+    | length ps < 1024 = hPut h (ps `snoc` 0x0a)
+    | otherwise        = hPut h ps >> hPut h (singleton (0x0a)) -- don't copy
+
 -- | Write a ByteString to stdout
 putStr :: ByteString -> IO ()
 putStr = hPut stdout
 
 -- | Write a ByteString to stdout, appending a newline byte
 putStrLn :: ByteString -> IO ()
-putStrLn ps = hPut stdout ps >> hPut stdout nl
-    where nl = singleton 0x0a
+putStrLn = hPutStrLn stdout
 
 -- | Read a 'ByteString' directly from the specified 'Handle'.  This
 -- is far more efficient than reading the characters into a 'String'
 -- and then using 'pack'.
 hGet :: Handle -> Int -> IO ByteString
 hGet _ 0 = return empty
-hGet h i = do fp <- mallocByteString i
-              l  <- withForeignPtr fp $ \p-> hGetBuf h p i
-              return $ PS fp 0 l
+hGet h i = createAndTrim i $ \p -> hGetBuf h p i
 
 #if defined(__GLASGOW_HASKELL__)
 -- | hGetNonBlocking is identical to 'hGet', except that it will never block
@@ -1919,10 +1823,7 @@ hGet h i = do fp <- mallocByteString i
 -- is available.
 hGetNonBlocking :: Handle -> Int -> IO ByteString
 hGetNonBlocking _ 0 = return empty
-hGetNonBlocking h i = do
-    fp <- mallocByteString i
-    l  <- withForeignPtr fp $ \p -> hGetBufNonBlocking h p i
-    return $ PS fp 0 l
+hGetNonBlocking h i = createAndTrim i $ \p -> hGetBufNonBlocking h p i
 #endif
 
 -- | Read entire handle contents into a 'ByteString'.
@@ -1942,7 +1843,7 @@ hGetContents h = do
     if i < start_size
         then do p' <- reallocArray p i
                 fp <- newForeignFreePtr p'
-                return $ PS fp 0 i
+                return $! PS fp 0 i
         else f p start_size
     where
         f p s = do
@@ -1953,7 +1854,7 @@ hGetContents h = do
                 then do let i' = s + i
                         p'' <- reallocArray p' i'
                         fp  <- newForeignFreePtr p''
-                        return $ PS fp 0 i'
+                        return $! PS fp 0 i'
                 else f p' s'
 
 -- | getContents. Equivalent to hGetContents stdin
@@ -1965,18 +1866,19 @@ getContents = hGetContents stdin
 -- 'pack'.  It also may be more efficient than opening the file and
 -- reading it using hGet.
 readFile :: FilePath -> IO ByteString
-readFile f = do
-    h <- openBinaryFile f ReadMode
-    l <- hFileSize h
-    s <- hGet h $ fromIntegral l
-    hClose h
-    return s
+readFile f = bracket (openBinaryFile f ReadMode) hClose
+    (\h -> hFileSize h >>= hGet h . fromIntegral)
 
 -- | Write a 'ByteString' to a file.
 writeFile :: FilePath -> ByteString -> IO ()
 writeFile f ps = bracket (openBinaryFile f WriteMode) hClose
     (\h -> hPut h ps)
 
+-- | Append a 'ByteString' to a file.
+appendFile :: FilePath -> ByteString -> IO ()
+appendFile f txt = bracket (openBinaryFile f AppendMode) hClose
+    (\hdl -> hPut hdl txt)
+
 {-
 --
 -- Disable until we can move it into a portable .hsc file
@@ -1996,7 +1898,7 @@ writeFile f ps = bracket (openBinaryFile f WriteMode) hClose
 -- On systems without mmap, this is the same as a readFile.
 --
 mmapFile :: FilePath -> IO ByteString
-mmapFile f = mmap f >>= \(fp,l) -> return $ PS fp 0 l
+mmapFile f = mmap f >>= \(fp,l) -> return $! PS fp 0 l
 
 mmap :: FilePath -> IO (ForeignPtr Word8, Int)
 mmap f = do
@@ -2020,6 +1922,8 @@ mmap f = do
                      else do
                           -- The munmap leads to crashes on OpenBSD.
                           -- maybe there's a use after unmap in there somewhere?
+                          -- Bulat suggests adding the hClose to the
+                          -- finalizer, excellent idea.
 #if !defined(__OpenBSD__)
                              let unmap = c_munmap p l >> return ()
 #else
@@ -2050,41 +1954,6 @@ getArgs =
 -- ---------------------------------------------------------------------
 -- Internal utilities
 
--- Unsafe conversion between 'Word8' and 'Char'. These are nops, and
--- silently truncate to 8 bits Chars > '\255'. They are provided as
--- convenience for ByteString construction.
-w2c :: Word8 -> Char
-#if !defined(__GLASGOW_HASKELL__)
-w2c = chr . fromIntegral
-#else
-w2c = unsafeChr . fromIntegral
-#endif
-{-# INLINE w2c #-}
-
-c2w :: Char -> Word8
-c2w = fromIntegral . ord
-{-# INLINE c2w #-}
-
--- Wrapper of mallocForeignPtrArray. Any ByteString allocated this way
--- is padded with a null byte.
-mallocByteString :: Int -> IO (ForeignPtr Word8)
-mallocByteString l = do
-    fp <- mallocForeignPtrArray (l+1)
-    withForeignPtr fp $ \p -> poke (p `plusPtr` l) (0::Word8)
-    return fp
-
--- | A way of creating ForeignPtrs outside the IO monad. The @Int@
--- argument gives the final size of the ByteString. Unlike 'generate'
--- the ByteString is not reallocated if the final size is less than the
--- estimated size. Also, unlike 'generate' ByteString's created this way
--- are managed on the Haskell heap.
-create :: Int -> (Ptr Word8 -> IO ()) -> ByteString
-create l write_ptr = inlinePerformIO $ do
-    fp <- mallocByteString (l+1)
-    withForeignPtr fp $ \p -> write_ptr p
-    return $ PS fp 0 l
-{-# INLINE create #-}
-
 -- | Perform an operation with a temporary ByteString
 withPtr :: ForeignPtr a -> (Ptr a -> IO b) -> b
 withPtr fp io = inlinePerformIO (withForeignPtr fp io)
@@ -2108,17 +1977,6 @@ findFromEndUntil f ps@(PS x s l) =
     else if f (last ps) then l
          else findFromEndUntil f (PS x s (l-1))
 
--- Just like inlinePerformIO, but we inline it. Big performance gains as
--- it exposes lots of things to further inlining
---
-{-# INLINE inlinePerformIO #-}
-inlinePerformIO :: IO a -> a
-#if defined(__GLASGOW_HASKELL__)
-inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
-#else
-inlinePerformIO = unsafePerformIO
-#endif
-
 {-# INLINE newForeignFreePtr #-}
 newForeignFreePtr :: Ptr Word8 -> IO (ForeignPtr Word8)
 #if defined(__GLASGOW_HASKELL__)
@@ -2126,259 +1984,3 @@ newForeignFreePtr p = FC.newForeignPtr p (c_free p)
 #else
 newForeignFreePtr p = newForeignPtr c_free_finalizer p
 #endif
-
--- ---------------------------------------------------------------------
--- 
--- Standard C functions
---
-
-foreign import ccall unsafe "string.h strlen" c_strlen
-    :: CString -> CInt
-
-foreign import ccall unsafe "stdlib.h malloc" c_malloc
-    :: CInt -> IO (Ptr Word8)
-
-foreign import ccall unsafe "static stdlib.h free" c_free
-    :: Ptr Word8 -> IO ()
-
-#if !defined(__GLASGOW_HASKELL__)
-foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
-    :: FunPtr (Ptr Word8 -> IO ())
-#endif
-
-foreign import ccall unsafe "string.h memset" memset
-    :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
-
-foreign import ccall unsafe "string.h memchr" memchr
-    :: Ptr Word8 -> Word8 -> CSize -> Ptr Word8
-
-foreign import ccall unsafe "string.h memcmp" memcmp
-    :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
-
-foreign import ccall unsafe "string.h memcpy" memcpy
-    :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
-
--- ---------------------------------------------------------------------
---
--- Uses our C code
---
-
-foreign import ccall unsafe "static fpstring.h reverse" c_reverse
-    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
-
-foreign import ccall unsafe "static fpstring.h intersperse" c_intersperse
-    :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
-
-foreign import ccall unsafe "static fpstring.h maximum" c_maximum
-    :: Ptr Word8 -> CInt -> Word8
-
-foreign import ccall unsafe "static fpstring.h minimum" c_minimum
-    :: Ptr Word8 -> CInt -> Word8
-
-foreign import ccall unsafe "static fpstring.h count" c_count
-    :: Ptr Word8 -> CInt -> Word8 -> CInt
-
--- ---------------------------------------------------------------------
--- MMap
-
-{-
-foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
-    :: Int -> Int -> IO (Ptr Word8)
-
-foreign import ccall unsafe "static unistd.h close" c_close
-    :: Int -> IO Int
-
-#  if !defined(__OpenBSD__)
-foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
-    :: Ptr Word8 -> Int -> IO Int
-#  endif
--}
-
--- ---------------------------------------------------------------------
--- Internal GHC Haskell magic
-
-#if defined(__GLASGOW_HASKELL__)
-foreign import ccall unsafe "RtsAPI.h getProgArgv"
-    getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
-
-foreign import ccall unsafe "__hscore_memcpy_src_off"
-   memcpy_ptr_baoff :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())
-#endif
-
--- ---------------------------------------------------------------------
---
--- Functional array fusion for ByteStrings. 
---
--- From the Data Parallel Haskell project, 
---      http://www.cse.unsw.edu.au/~chak/project/dph/
---
-
--- |Data type for accumulators which can be ignored. The rewrite rules rely on
--- the fact that no bottoms of this type are ever constructed; hence, we can
--- assume @(_ :: NoAL) `seq` x = x@.
---
-data NoAL = NoAL
-
--- | Special forms of loop arguments
---
--- * These are common special cases for the three function arguments of gen
---   and loop; we give them special names to make it easier to trigger RULES
---   applying in the special cases represented by these arguments.  The
---   "INLINE [1]" makes sure that these functions are only inlined in the last
---   two simplifier phases.
---
--- * In the case where the accumulator is not needed, it is better to always
---   explicitly return a value `()', rather than just copy the input to the
---   output, as the former gives GHC better local information.
--- 
-
--- | Element function expressing a mapping only
-mapEFL :: (Word8 -> Word8) -> (NoAL -> Word8 -> (NoAL, Maybe Word8))
-mapEFL f = \_ e -> (noAL, (Just $ f e))
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] mapEFL #-}
-#endif
-
--- | Element function implementing a filter function only
-filterEFL :: (Word8 -> Bool) -> (NoAL -> Word8 -> (NoAL, Maybe Word8))
-filterEFL p = \_ e -> if p e then (noAL, Just e) else (noAL, Nothing)
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] filterEFL #-}
-#endif
-
--- |Element function expressing a reduction only
-foldEFL :: (acc -> Word8 -> acc) -> (acc -> Word8 -> (acc, Maybe Word8))
-foldEFL f = \a e -> (f a e, Nothing)
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] foldEFL #-}
-#endif
-
--- | A strict foldEFL.
-foldEFL' :: (acc -> Word8 -> acc) -> (acc -> Word8 -> (acc, Maybe Word8))
-foldEFL' f = \a e -> let a' = f a e in a' `seq` (a', Nothing)
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] foldEFL' #-}
-#endif
-
--- | Element function expressing a prefix reduction only
---
-scanEFL :: (Word8 -> Word8 -> Word8) -> Word8 -> Word8 -> (Word8, Maybe Word8)
-scanEFL f = \a e -> (f a e, Just a)
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] scanEFL #-}
-#endif
-
--- | Element function implementing a map and fold
---
-mapAccumEFL :: (acc -> Word8 -> (acc, Word8)) -> acc -> Word8 -> (acc, Maybe Word8)
-mapAccumEFL f = \a e -> case f a e of (a', e') -> (a', Just e')
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] mapAccumEFL #-}
-#endif
-
--- | Element function implementing a map with index
---
-mapIndexEFL :: (Int -> Word8 -> Word8) -> Int -> Word8 -> (Int, Maybe Word8)
-mapIndexEFL f = \i e -> let i' = i+1 in i' `seq` (i', Just $ f i e)
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] mapIndexEFL #-}
-#endif
-
--- | No accumulator
-noAL :: NoAL
-noAL = NoAL
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] noAL #-}
-#endif
-
--- | Projection functions that are fusion friendly (as in, we determine when
--- they are inlined)
-loopArr :: (acc, byteString) -> byteString
-loopArr (_, arr) = arr
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] loopArr #-}
-#endif
-
-loopAcc :: (acc, byteString) -> acc
-loopAcc (acc, _) = acc
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] loopAcc #-}
-#endif
-
-loopSndAcc :: ((acc1, acc2), byteString) -> (acc2, byteString)
-loopSndAcc ((_, acc), arr) = (acc, arr)
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] loopSndAcc #-}
-#endif
-
-------------------------------------------------------------------------
-
---
--- size, and then percentage.
---
-
--- | Iteration over over ByteStrings
-loopU :: (acc -> Word8 -> (acc, Maybe Word8))  -- ^ mapping & folding, once per elem
-      -> acc                                   -- ^ initial acc value
-      -> ByteString                            -- ^ input ByteString
-      -> (acc, ByteString)
-
-loopU f start (PS z s i) = inlinePerformIO $ withForeignPtr z $ \a -> do
-    fp          <- mallocByteString i
-    (ptr,n,acc) <- withForeignPtr fp $ \p -> do
-        (acc, i') <- go (a `plusPtr` s) p start
-        if i' == i
-            then return (fp,i',acc)                 -- no realloc for map
-            else do fp_ <- mallocByteString i'      -- realloc
-                    withForeignPtr fp_ $ \p' -> memcpy p' p (fromIntegral i')
-                    return (fp_,i',acc)
-
-    return (acc, PS ptr 0 n)
-  where
-    go p ma = trans 0 0
-        where
-            STRICT3(trans)
-            trans a_off ma_off acc
-                | a_off >= i = return (acc, ma_off)
-                | otherwise  = do
-                    x <- peekByteOff p a_off
-                    let (acc', oe) = f acc x
-                    ma_off' <- case oe of
-                        Nothing  -> return ma_off
-                        Just e   -> do pokeByteOff ma ma_off e
-                                       return $ ma_off + 1
-                    trans (a_off+1) ma_off' acc'
-
-#if defined(__GLASGOW_HASKELL__)
-{-# INLINE [1] loopU #-}
-#endif
-
-infixr 9 `fuseEFL`
-
--- |Fuse to flat loop functions
-fuseEFL :: (a1 -> Word8  -> (a1, Maybe Word8))
-        -> (a2 -> Word8  -> (a2, Maybe Word8))
-        -> (a1, a2)
-        -> Word8
-        -> ((a1, a2), Maybe Word8)
-fuseEFL f g (acc1, acc2) e1 =
-    case f acc1 e1 of
-        (acc1', Nothing) -> ((acc1', acc2), Nothing)
-        (acc1', Just e2) ->
-            case g acc2 e2 of
-                (acc2', res) -> ((acc1', acc2'), res)
-
-{-# RULES
-
-"loop/loop fusion!" forall em1 em2 start1 start2 arr.
-  loopU em2 start2 (loopArr (loopU em1 start1 arr)) =
-    loopSndAcc (loopU (em1 `fuseEFL` em2) (start1, start2) arr)
-
-"loopArr/loopSndAcc" forall x.
-  loopArr (loopSndAcc x) = loopArr x
-
-"seq/NoAL" forall (u::NoAL) e.
-  u `seq` e = e
-
-  #-}
-
diff --git a/Data/ByteString/Base.hs b/Data/ByteString/Base.hs
new file mode 100644
index 0000000..dac2a16
--- /dev/null
+++ b/Data/ByteString/Base.hs
@@ -0,0 +1,463 @@
+{-# OPTIONS_GHC -cpp -fglasgow-exts #-}
+--
+-- Module      : ByteString.Base
+-- License     : BSD-style
+-- Maintainer  : dons@cse.unsw.edu.au
+-- Stability   : experimental
+-- Portability : portable, requires ffi and cpp
+-- Tested with : GHC 6.4.1 and Hugs March 2005
+-- 
+
+-- | A module containing semi-public ByteString internals. This exposes
+-- the ByteString representation and low level construction functions.
+-- Modules which extend the ByteString system will need to use this module
+-- while ideally most users will be able to make do with the public interface
+-- modules.
+--
+module Data.ByteString.Base (
+
+        -- * The @ByteString@ type and representation
+        ByteString(..),             -- instances: Eq, Ord, Show, Read, Data, Typeable
+
+        -- * Unchecked access
+        unsafeHead,             -- :: ByteString -> Word8
+        unsafeTail,             -- :: ByteString -> ByteString
+        unsafeIndex,            -- :: ByteString -> Int -> Word8
+        unsafeTake,             -- :: Int -> ByteString -> ByteString
+        unsafeDrop,             -- :: Int -> ByteString -> ByteString
+
+        -- * Low level introduction and elimination
+        create,                 -- :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
+        createAndTrim,          -- :: Int -> (Ptr Word8 -> IO Int) -> IO  ByteString
+        createAndTrim',         -- :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
+
+        unsafeCreate,           -- :: Int -> (Ptr Word8 -> IO ()) ->  ByteString
+        unsafeUseAsCString,     -- :: ByteString -> (CString -> IO a) -> IO a
+        unsafeUseAsCStringLen,  -- :: ByteString -> (CStringLen -> IO a) -> IO a
+
+        fromForeignPtr,         -- :: ForeignPtr Word8 -> Int -> ByteString
+        toForeignPtr,           -- :: ByteString -> (ForeignPtr Word8, Int, Int)
+
+#if defined(__GLASGOW_HASKELL__)
+        packCStringFinalizer,   -- :: Ptr Word8 -> Int -> IO () -> IO ByteString
+        packAddress,            -- :: Addr# -> ByteString
+        unsafePackAddress,      -- :: Int -> Addr# -> ByteString
+        unsafeFinalize,         -- :: ByteString -> IO ()
+#endif
+
+        -- * Utilities
+        inlinePerformIO,            -- :: IO a -> a
+
+        countOccurrences,           -- :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
+
+        -- * Standard C Functions
+        c_strlen,                   -- :: CString -> IO CInt
+        c_malloc,                   -- :: CInt -> IO (Ptr Word8)
+        c_free,                     -- :: Ptr Word8 -> IO ()
+
+#if !defined(__GLASGOW_HASKELL__)
+        c_free_finalizer,           -- :: FunPtr (Ptr Word8 -> IO ())
+#endif
+
+        memchr,                     -- :: Ptr Word8 -> Word8 -> CSize -> IO Ptr Word8
+        memcmp,                     -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
+        memcpy,                     -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
+        memmove,                    -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
+        memset,                     -- :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
+
+        -- * cbits functions
+        c_reverse,                  -- :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
+        c_intersperse,              -- :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
+        c_maximum,                  -- :: Ptr Word8 -> CInt -> IO Word8
+        c_minimum,                  -- :: Ptr Word8 -> CInt -> IO Word8
+        c_count,                    -- :: Ptr Word8 -> CInt -> Word8 -> IO CInt
+
+        -- * Internal GHC magic
+#if defined(__GLASGOW_HASKELL__)
+        getProgArgv,                -- :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
+        memcpy_ptr_baoff,           -- :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())
+#endif
+
+        -- * Chars
+        w2c, c2w, isSpaceWord8
+
+  ) where
+
+import Foreign.ForeignPtr
+import Foreign.Ptr
+import Foreign.Storable         (Storable(..))
+import Foreign.C.Types
+import Foreign.C.String         (CString, CStringLen)
+
+import Control.Exception        (assert)
+
+import Data.Char                (ord)
+import Data.Word                (Word8)
+
+#if defined(__GLASGOW_HASKELL__)
+import qualified Foreign.Concurrent as FC (newForeignPtr)
+
+import Data.Generics            (Data(..), Typeable(..))
+import GHC.Prim                 (Addr#)
+import GHC.Ptr                  (Ptr(..))
+import GHC.Base                 (realWorld#,unsafeChr)
+import GHC.IOBase
+
+#if defined(__GLASGOW_HASKELL__) && !defined(SLOW_FOREIGN_PTR)
+import GHC.ForeignPtr           (mallocPlainForeignPtrBytes)
+#endif
+
+#else
+import Data.Char                (chr)
+import System.IO.Unsafe         (unsafePerformIO)
+#endif
+
+-- CFILES stuff is Hugs only
+{-# CFILES cbits/fpstring.c #-}
+
+-- -----------------------------------------------------------------------------
+--
+-- Useful macros, until we have bang patterns
+--
+
+#define STRICT1(f) f a | a `seq` False = undefined
+#define STRICT2(f) f a b | a `seq` b `seq` False = undefined
+#define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
+#define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
+#define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
+
+-- -----------------------------------------------------------------------------
+
+-- | A space-efficient representation of a Word8 vector, supporting many
+-- efficient operations.  A 'ByteString' contains 8-bit characters only.
+--
+-- Instances of Eq, Ord, Read, Show, Data, Typeable
+--
+data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8)
+                     {-# UNPACK #-} !Int                -- offset
+                     {-# UNPACK #-} !Int                -- length
+
+#if defined(__GLASGOW_HASKELL__)
+    deriving (Data, Typeable)
+#endif
+
+-- ---------------------------------------------------------------------
+--
+-- Extensions to the basic interface
+--
+
+-- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits the
+-- check for the empty case, so there is an obligation on the programmer
+-- to provide a proof that the ByteString is non-empty.
+unsafeHead :: ByteString -> Word8
+unsafeHead (PS x s l) = assert (l > 0) $
+    inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p s
+{-# INLINE unsafeHead #-}
+
+-- | A variety of 'tail' for non-empty ByteStrings. 'unsafeTail' omits the
+-- check for the empty case. As with 'unsafeHead', the programmer must
+-- provide a separate proof that the ByteString is non-empty.
+unsafeTail :: ByteString -> ByteString
+unsafeTail (PS ps s l) = assert (l > 0) $ PS ps (s+1) (l-1)
+{-# INLINE unsafeTail #-}
+
+-- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a 'Word8'
+-- This omits the bounds check, which means there is an accompanying
+-- obligation on the programmer to ensure the bounds are checked in some
+-- other way.
+unsafeIndex :: ByteString -> Int -> Word8
+unsafeIndex (PS x s l) i = assert (i >= 0 && i < l) $
+    inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p (s+i)
+{-# INLINE unsafeIndex #-}
+
+-- | A variety of 'take' which omits the checks on @n@ so there is an
+-- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
+unsafeTake :: Int -> ByteString -> ByteString
+unsafeTake n (PS x s l) = assert (0 <= n && n <= l) $ PS x s n
+{-# INLINE unsafeTake #-}
+
+-- | A variety of 'drop' which omits the checks on @n@ so there is an
+-- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
+unsafeDrop  :: Int -> ByteString -> ByteString
+unsafeDrop n (PS x s l) = assert (0 <= n && n <= l) $ PS x (s+n) (l-n)
+{-# INLINE unsafeDrop #-}
+
+-- ---------------------------------------------------------------------
+-- Low level constructors
+
+-- | /O(1)/ Build a ByteString from a ForeignPtr
+fromForeignPtr :: ForeignPtr Word8 -> Int -> ByteString
+fromForeignPtr fp l = PS fp 0 l
+
+-- | /O(1)/ Deconstruct a ForeignPtr from a ByteString
+toForeignPtr :: ByteString -> (ForeignPtr Word8, Int, Int)
+toForeignPtr (PS ps s l) = (ps, s, l)
+
+-- | A way of creating ByteStrings outside the IO monad. The @Int@
+-- argument gives the final size of the ByteString. Unlike
+-- 'createAndTrim' the ByteString is not reallocated if the final size
+-- is less than the estimated size.
+unsafeCreate :: Int -> (Ptr Word8 -> IO ()) -> ByteString
+unsafeCreate l f = unsafePerformIO (create l f)
+{-# INLINE unsafeCreate #-}
+
+-- | Wrapper of mallocForeignPtrBytes.
+create :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
+create l f = do
+#if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__)
+    fp <- mallocForeignPtrBytes l
+#else
+    fp <- mallocPlainForeignPtrBytes l
+#endif
+    withForeignPtr fp $ \p -> f p
+    return $! PS fp 0 l
+
+-- | Given the maximum size needed and a function to make the contents
+-- of a ByteString, createAndTrim makes the 'ByteString'. The generating
+-- function is required to return the actual final size (<= the maximum
+-- size), and the resulting byte array is realloced to this size.
+--
+-- createAndTrim is the main mechanism for creating custom, efficient
+-- ByteString functions, using Haskell or C functions to fill the space.
+--
+createAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
+createAndTrim l f = do
+#if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__)
+    fp <- mallocForeignPtrBytes l
+#else
+    fp <- mallocPlainForeignPtrBytes l
+#endif
+    withForeignPtr fp $ \p -> do
+        l' <- f p
+        if assert (l' <= l) $ l' >= l
+            then return $! PS fp 0 l
+            else create l' $ \p' -> memcpy p' p (fromIntegral l')
+
+createAndTrim' :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
+createAndTrim' l f = do
+#if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__)
+    fp <- mallocForeignPtrBytes l
+#else
+    fp <- mallocPlainForeignPtrBytes l
+#endif
+    withForeignPtr fp $ \p -> do
+        (off, l', res) <- f p
+        if assert (l' <= l) $ l' >= l
+            then return $! (PS fp 0 l, res)
+            else do ps <- create l' $ \p' ->
+                            memcpy p' (p `plusPtr` off) (fromIntegral l')
+                    return $! (ps, res)
+
+#if defined(__GLASGOW_HASKELL__)
+-- | /O(n)/ Pack a null-terminated sequence of bytes, pointed to by an
+-- Addr\# (an arbitrary machine address assumed to point outside the
+-- garbage-collected heap) into a @ByteString@. A much faster way to
+-- create an Addr\# is with an unboxed string literal, than to pack a
+-- boxed string. A unboxed string literal is compiled to a static @char
+-- []@ by GHC. Establishing the length of the string requires a call to
+-- @strlen(3)@, so the Addr# must point to a null-terminated buffer (as
+-- is the case with "string"# literals in GHC). Use 'unsafePackAddress'
+-- if you know the length of the string statically.
+--
+-- An example:
+--
+-- > literalFS = packAddress "literal"#
+--
+packAddress :: Addr# -> ByteString
+packAddress addr# = inlinePerformIO $ do
+    p <- newForeignPtr_ cstr
+    l <- c_strlen cstr
+    return $ PS p 0 (fromIntegral l)
+  where
+    cstr = Ptr addr#
+{-# INLINE packAddress #-}
+
+-- | /O(1)/ 'unsafePackAddress' provides constant-time construction of
+-- 'ByteStrings' -- which is ideal for string literals. It packs a
+-- null-terminated sequence of bytes into a 'ByteString', given a raw
+-- 'Addr\#' to the string, and the length of the string. Make sure the
+-- length is correct, otherwise use the safer 'packAddress' (where the
+-- length will be calculated once at runtime).
+unsafePackAddress :: Int -> Addr# -> ByteString
+unsafePackAddress len addr# = inlinePerformIO $ do
+    p <- newForeignPtr_ cstr
+    return $ PS p 0 len
+    where cstr = Ptr addr#
+
+-- | /O(1)/ Construct a 'ByteString' given a C Ptr Word8 buffer, a
+-- length, and an IO action representing a finalizer. This function is
+-- not available on Hugs.
+--
+packCStringFinalizer :: Ptr Word8 -> Int -> IO () -> IO ByteString
+packCStringFinalizer p l f = do
+    fp <- FC.newForeignPtr p f
+    return $ PS fp 0 l
+
+-- | Explicitly run the finaliser associated with a 'ByteString'.
+-- Further references to this value may generate invalid memory
+-- references. This operation is unsafe, as there may be other
+-- 'ByteStrings' referring to the same underlying pages. If you use
+-- this, you need to have a proof of some kind that all 'ByteString's
+-- ever generated from the underlying byte array are no longer live.
+unsafeFinalize :: ByteString -> IO ()
+unsafeFinalize (PS p _ _) = finalizeForeignPtr p
+
+#endif
+
+------------------------------------------------------------------------
+
+-- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
+w2c :: Word8 -> Char
+#if !defined(__GLASGOW_HASKELL__)
+w2c = chr . fromIntegral
+#else
+w2c = unsafeChr . fromIntegral
+#endif
+{-# INLINE w2c #-}
+
+-- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
+-- silently truncates to 8 bits Chars > '\255'. It is provided as
+-- convenience for ByteString construction.
+c2w :: Char -> Word8
+c2w = fromIntegral . ord
+{-# INLINE c2w #-}
+
+-- Selects white-space characters in the Latin-1 range
+-- ordered by frequency
+-- Idea from Ketil
+isSpaceWord8 :: Word8 -> Bool
+isSpaceWord8 w = case w of
+    0x20 -> True -- SPACE
+    0x0A -> True -- LF, \n
+    0x09 -> True -- HT, \t
+    0x0C -> True -- FF, \f
+    0x0D -> True -- CR, \r
+    0x0B -> True -- VT, \v
+    0xA0 -> True -- spotted by QC..
+    _    -> False
+{-# INLINE isSpaceWord8 #-}
+
+------------------------------------------------------------------------
+-- | Just like unsafePerformIO, but we inline it. Big performance gains as
+-- it exposes lots of things to further inlining
+--
+{-# INLINE inlinePerformIO #-}
+inlinePerformIO :: IO a -> a
+#if defined(__GLASGOW_HASKELL__)
+inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
+#else
+inlinePerformIO = unsafePerformIO
+#endif
+
+-- | Count the number of occurrences of each byte.
+--
+{-# SPECIALIZE countOccurrences :: Ptr CSize -> Ptr Word8 -> Int -> IO () #-}
+countOccurrences :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
+STRICT3(countOccurrences)
+countOccurrences counts str l = go 0
+ where
+    STRICT1(go)
+    go i | i == l    = return ()
+         | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
+                          x <- peekElemOff counts k
+                          pokeElemOff counts k (x + 1)
+                          go (i + 1)
+
+-- | /O(1) construction/ Use a @ByteString@ with a function requiring a
+-- @CString@.  Warning: modifying the @CString@ will affect the
+-- @ByteString@.  Why is this function unsafe? It relies on the null
+-- byte at the end of the ByteString to be there. Unless you can
+-- guarantee the null byte, you should use the safe version, which will
+-- copy the string first.
+unsafeUseAsCString :: ByteString -> (CString -> IO a) -> IO a
+unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s)
+
+-- | /O(1) construction/ Use a @ByteString@ with a function requiring a
+-- @CStringLen@.  Warning: modifying the @CStringLen@ will affect the
+-- @ByteString@.  This is analogous to unsafeUseAsCString, and comes
+-- with the same safety requirements. The user must ensure there is a
+-- null byte at the end of the string.
+unsafeUseAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a
+unsafeUseAsCStringLen (PS ps s l) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s,l)
+
+-- ---------------------------------------------------------------------
+-- 
+-- Standard C functions
+--
+
+foreign import ccall unsafe "string.h strlen" c_strlen
+    :: CString -> IO CInt
+
+foreign import ccall unsafe "stdlib.h malloc" c_malloc
+    :: CInt -> IO (Ptr Word8)
+
+foreign import ccall unsafe "static stdlib.h free" c_free
+    :: Ptr Word8 -> IO ()
+
+#if !defined(__GLASGOW_HASKELL__)
+foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
+    :: FunPtr (Ptr Word8 -> IO ())
+#endif
+
+foreign import ccall unsafe "string.h memchr" memchr
+    :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
+
+foreign import ccall unsafe "string.h memcmp" memcmp
+    :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
+
+foreign import ccall unsafe "string.h memcpy" memcpy
+    :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
+
+foreign import ccall unsafe "string.h memmove" memmove
+    :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
+
+foreign import ccall unsafe "string.h memset" memset
+    :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
+
+
+-- ---------------------------------------------------------------------
+--
+-- Uses our C code
+--
+
+foreign import ccall unsafe "static fpstring.h fps_reverse" c_reverse
+    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
+
+foreign import ccall unsafe "static fpstring.h fps_intersperse" c_intersperse
+    :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
+
+foreign import ccall unsafe "static fpstring.h fps_maximum" c_maximum
+    :: Ptr Word8 -> CInt -> IO Word8
+
+foreign import ccall unsafe "static fpstring.h fps_minimum" c_minimum
+    :: Ptr Word8 -> CInt -> IO Word8
+
+foreign import ccall unsafe "static fpstring.h fps_count" c_count
+    :: Ptr Word8 -> CInt -> Word8 -> IO CInt
+
+-- ---------------------------------------------------------------------
+-- MMap
+
+{-
+foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
+    :: Int -> Int -> IO (Ptr Word8)
+
+foreign import ccall unsafe "static unistd.h close" c_close
+    :: Int -> IO Int
+
+#  if !defined(__OpenBSD__)
+foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
+    :: Ptr Word8 -> Int -> IO Int
+#  endif
+-}
+
+-- ---------------------------------------------------------------------
+-- Internal GHC Haskell magic
+
+#if defined(__GLASGOW_HASKELL__)
+foreign import ccall unsafe "RtsAPI.h getProgArgv"
+    getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
+
+foreign import ccall unsafe "__hscore_memcpy_src_off"
+   memcpy_ptr_baoff :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())
+#endif
diff --git a/Data/ByteString/Char8.hs b/Data/ByteString/Char8.hs
index 86916f2..71bf394 100644
--- a/Data/ByteString/Char8.hs
+++ b/Data/ByteString/Char8.hs
@@ -1,4 +1,4 @@
-{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
+{-# OPTIONS_GHC -cpp -fglasgow-exts #-}
 --
 -- Module      : Data.ByteString.Char8
 -- Copyright   : (c) Don Stewart 2006
@@ -35,11 +35,11 @@
 module Data.ByteString.Char8 (
 
         -- * The @ByteString@ type
-        ByteString(..),         -- instances: Eq, Ord, Show, Read, Data, Typeable
+        ByteString,             -- abstract, instances: Eq, Ord, Show, Read, Data, Typeable, Monoid
 
         -- * Introducing and eliminating 'ByteString's
         empty,                  -- :: ByteString
-        singleton,               -- :: Char   -> ByteString
+        singleton,              -- :: Char   -> ByteString
         pack,                   -- :: String -> ByteString
         unpack,                 -- :: ByteString -> String
 
@@ -104,6 +104,7 @@ module Data.ByteString.Char8 (
         span,                   -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
         spanEnd,                -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
         break,                  -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+        breakEnd,               -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
         group,                  -- :: ByteString -> [ByteString]
         groupBy,                -- :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
         inits,                  -- :: ByteString -> [ByteString]
@@ -179,19 +180,28 @@ module Data.ByteString.Char8 (
         -- * Ordered ByteStrings
         sort,                   -- :: ByteString -> ByteString
 
-        -- * Unchecked access
-        unsafeHead,             -- :: ByteString -> Char
-        unsafeTail,             -- :: ByteString -> ByteString
-        unsafeIndex,            -- :: ByteString -> Int -> Char
+        -- * Conversion
         w2c,                    -- :: Word8 -> Char
         c2w,                    -- :: Char  -> Word8
 
         -- * Reading from ByteStrings
         readInt,                -- :: ByteString -> Maybe Int
-        unsafeReadInt,          -- :: ByteString -> Maybe Int
+
+        -- * Low level CString conversions
+
+        -- ** Packing CStrings and pointers
+        packCString,            -- :: CString -> ByteString
+        packCStringLen,         -- :: CString -> ByteString
+        packMallocCString,      -- :: CString -> ByteString
+
+        -- ** Using ByteStrings as CStrings
+        useAsCString,           -- :: ByteString -> (CString -> IO a) -> IO a
+        useAsCStringLen,        -- :: ByteString -> (CStringLen -> IO a) -> IO a
 
         -- * Copying ByteStrings
         copy,                   -- :: ByteString -> ByteString
+        copyCString,            -- :: CString -> IO ByteString
+        copyCStringLen,         -- :: CStringLen -> IO ByteString
 
         -- * I\/O with @ByteString@s
 
@@ -207,17 +217,21 @@ module Data.ByteString.Char8 (
         -- ** Files
         readFile,               -- :: FilePath -> IO ByteString
         writeFile,              -- :: FilePath -> ByteString -> IO ()
+        appendFile,             -- :: FilePath -> ByteString -> IO ()
 --      mmapFile,               -- :: FilePath -> IO ByteString
 
         -- ** I\/O with Handles
 #if defined(__GLASGOW_HASKELL__)
         getArgs,                -- :: IO [ByteString]
         hGetLine,               -- :: Handle -> IO ByteString
+        hGetLines,              -- :: Handle -> IO ByteString
         hGetNonBlocking,        -- :: Handle -> Int -> IO ByteString
 #endif
         hGetContents,           -- :: Handle -> IO ByteString
         hGet,                   -- :: Handle -> Int -> IO ByteString
         hPut,                   -- :: Handle -> ByteString -> IO ()
+        hPutStr,                -- :: Handle -> ByteString -> IO ()
+        hPutStrLn,              -- :: Handle -> ByteString -> IO ()
 
 #if defined(__GLASGOW_HASKELL__)
         -- * Low level construction
@@ -230,8 +244,6 @@ module Data.ByteString.Char8 (
 #if defined(__GLASGOW_HASKELL__)
         unpackList,
 #endif
-        noAL, NoAL, loopArr, loopAcc, loopSndAcc,
-        loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL,
         filter', map'
 
     ) where
@@ -242,45 +254,47 @@ import Prelude hiding           (reverse,head,tail,last,init,null
                                 ,concat,any,take,drop,splitAt,takeWhile
                                 ,dropWhile,span,break,elem,filter,unwords
                                 ,words,maximum,minimum,all,concatMap,scanl,scanl1
-                                ,foldl1,foldr1,readFile,writeFile,replicate
+                                ,foldl1,foldr1,readFile,writeFile,appendFile,replicate
                                 ,getContents,getLine,putStr,putStrLn
                                 ,zip,zipWith,unzip,notElem)
 
 import qualified Data.ByteString as B
+import qualified Data.ByteString.Base as B
 
 -- Listy functions transparently exported
-import Data.ByteString (ByteString(..)
-                       ,empty,null,length,tail,init,append
+import Data.ByteString (empty,null,length,tail,init,append
                        ,inits,tails,reverse,transpose
                        ,concat,take,drop,splitAt,join
                        ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
-                       ,findSubstrings,unsafeTail,copy,group
+                       ,findSubstrings,copy,group
 
                        ,getContents, putStr, putStrLn
-                       ,readFile, {-mmapFile,-} writeFile
-                       ,hGetContents, hGet, hPut
+                       ,readFile, {-mmapFile,-} writeFile, appendFile
+                       ,hGetContents, hGet, hPut, hPutStr, hPutStrLn
+                       ,packCString,packCStringLen, packMallocCString
+                       ,useAsCString,useAsCStringLen, copyCString,copyCStringLen
 #if defined(__GLASGOW_HASKELL__)
-                       ,getLine, getArgs, hGetLine, hGetNonBlocking
-                       ,packAddress, unsafePackAddress
+                       ,getLine, getArgs, hGetLine, hGetLines, hGetNonBlocking
                        ,unpackList
 #endif
-                       ,noAL, NoAL, loopArr, loopAcc, loopSndAcc
-                       ,loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL
-                       ,useAsCString, unsafeUseAsCString
                        )
 
-import Data.Char
+import Data.ByteString.Base (
+                        ByteString(..)
+#if defined(__GLASGOW_HASKELL__)
+                       ,packAddress, unsafePackAddress
+#endif
+                       ,c2w, w2c, unsafeTail, inlinePerformIO, isSpaceWord8
+                       )
 
 import qualified Data.List as List (intersperse)
 
 import Foreign
-import Foreign.C.Types          (CLong)
-import Foreign.Marshal.Utils    (with)
 
 #if defined(__GLASGOW_HASKELL__)
-import GHC.Base                 (Char(..),unsafeChr,unpackCString#,unsafeCoerce#)
+import GHC.Base                 (Char(..),unpackCString#,unsafeCoerce#)
 import GHC.IOBase               (IO(..),stToIO)
-import GHC.Prim                 (Addr#,writeWord8OffAddr#,realWorld#,plusAddr#)
+import GHC.Prim                 (Addr#,writeWord8OffAddr#,plusAddr#)
 import GHC.Ptr                  (Ptr(..))
 import GHC.ST                   (ST(..))
 #endif
@@ -288,6 +302,7 @@ import GHC.ST                   (ST(..))
 #define STRICT1(f) f a | a `seq` False = undefined
 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
+#define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
 
 ------------------------------------------------------------------------
 
@@ -303,13 +318,13 @@ singleton = B.singleton . c2w
 pack :: String -> ByteString
 #if !defined(__GLASGOW_HASKELL__)
 
-pack str = B.create (P.length str) $ \p -> go p str
+pack str = B.unsafeCreate (P.length str) $ \p -> go p str
     where go _ []     = return ()
           go p (x:xs) = poke p (c2w x) >> go (p `plusPtr` 1) xs
 
 #else /* hack away */
 
-pack str = B.create (P.length str) $ \(Ptr p) -> stToIO (go p str)
+pack str = B.unsafeCreate (P.length str) $ \(Ptr p) -> stToIO (go p str)
   where
     go :: Addr# -> [Char] -> ST a ()
     go _ []        = return ()
@@ -527,6 +542,13 @@ spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
 spanEnd f = B.spanEnd (f . w2c)
 {-# INLINE spanEnd #-}
 
+-- | 'breakEnd' behaves like 'break' but from the end of the 'ByteString'
+-- 
+-- breakEnd p == spanEnd (not.p)
+breakEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+breakEnd f = B.breakEnd (f . w2c)
+{-# INLINE breakEnd #-}
+
 -- | 'breakChar' breaks its ByteString argument at the first occurence
 -- of the specified Char. It is more efficient than 'break' as it is
 -- implemented with @memchr(3)@. I.e.
@@ -742,22 +764,6 @@ unsafeIndex :: ByteString -> Int -> Char
 unsafeIndex = (w2c .) . B.unsafeIndex
 {-# INLINE unsafeIndex #-}
 
--- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
-w2c :: Word8 -> Char
-#if !defined(__GLASGOW_HASKELL__)
-w2c = chr . fromIntegral
-#else
-w2c = unsafeChr . fromIntegral
-#endif
-{-# INLINE w2c #-}
-
--- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
--- silently truncates to 8 bits Chars > '\255'. It is provided as
--- convenience for ByteString construction.
-c2w :: Char -> Word8
-c2w = fromIntegral . ord
-{-# INLINE c2w #-}
-
 -- ---------------------------------------------------------------------
 -- Things that depend on the encoding
 
@@ -769,7 +775,7 @@ c2w = fromIntegral . ord
 breakSpace :: ByteString -> (ByteString,ByteString)
 breakSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
     i <- firstspace (p `plusPtr` s) 0 l
-    return $ case () of {_
+    return $! case () of {_
         | i == 0    -> (empty, PS x s l)
         | i == l    -> (PS x s l, empty)
         | otherwise -> (PS x s i, PS x (s+i) (l-i))
@@ -792,7 +798,7 @@ firstspace ptr n m
 dropSpace :: ByteString -> ByteString
 dropSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
     i <- firstnonspace (p `plusPtr` s) 0 l
-    return $ if i == l then empty else PS x (s+i) (l-i)
+    return $! if i == l then empty else PS x (s+i) (l-i)
 {-# INLINE dropSpace #-}
 
 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
@@ -812,7 +818,7 @@ firstnonspace ptr n m
 dropSpaceEnd :: ByteString -> ByteString
 dropSpaceEnd (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
     i <- lastnonspace (p `plusPtr` s) (l-1)
-    return $ if i == (-1) then empty else PS x s (i+1)
+    return $! if i == (-1) then empty else PS x s (i+1)
 {-# INLINE dropSpaceEnd #-}
 
 lastnonspace :: Ptr Word8 -> Int -> IO Int
@@ -972,90 +978,33 @@ betweenLines start end ps =
 -- ---------------------------------------------------------------------
 -- Reading from ByteStrings
 
--- | readInt skips any whitespace at the beginning of its argument, and
--- reads an Int from the beginning of the ByteString.  If there is no
+-- | readInt reads an Int from the beginning of the ByteString.  If there is no
 -- integer at the beginning of the string, it returns Nothing, otherwise
 -- it just returns the int read, and the rest of the string.
 readInt :: ByteString -> Maybe (Int, ByteString)
-readInt p@(PS x s l) = inlinePerformIO $ useAsCString p $ \cstr ->
-    with (castPtr cstr) $ \endpp -> do
-        val     <- c_strtol (castPtr cstr) endpp 0
-        skipped <- (`minusPtr` cstr) `fmap` peek endpp
-        return $ if skipped == 0
-                 then Nothing
-                 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
-
--- | unsafeReadInt is like readInt, but requires a null terminated
--- ByteString. It avoids a copy if this is the case. It returns the Int
--- read, if any, and the rest of the string.
-unsafeReadInt :: ByteString -> Maybe (Int, ByteString)
-unsafeReadInt p@(PS x s l) = inlinePerformIO $ unsafeUseAsCString p $ \cstr ->
-    with (castPtr cstr) $ \endpp -> do
-        val     <- c_strtol (castPtr cstr) endpp 0
-        skipped <- (`minusPtr` cstr) `fmap` peek endpp
-        return $ if skipped == 0
-                 then Nothing
-                 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
-
-foreign import ccall unsafe "stdlib.h strtol" c_strtol
-    :: Ptr Word8 -> Ptr (Ptr Word8) -> Int -> IO CLong
-
-{-
---
--- not quite there yet
---
-readInt :: ByteString -> Maybe (Int, ByteString)
-readInt = go 0
-    where
-        STRICT2(go)
-        go i ps
-            | B.null ps = Nothing
-            | x == '-'  = neg 0 xs
-            | otherwise = pos (parse x) xs
-            where (x, xs) = (ps `unsafeIndex` 0, unsafeTail ps)
-
-        STRICT2(neg)
-        neg n qs | isSpace x   = return $ Just ((i-n),xs)
-                 | otherwise   = neg (parse x + (10 * n)) xs
-                 where (x, xs) = (qs `unsafeIndex` 0, unsafeTail qs)
-
-        STRICT2(pos)
-        pos n qs | isSpace x = go (i+n) xs
-                 | otherwise = pos (parse x + (10 * n)) xs
-                 where (x, xs) = (qs `unsafeIndexWord8` 0, unsafeTail qs)
-
-        parse w = fromIntegral (w - 48) :: Int
-        {-# INLINE parse #-}
--}
-
--- ---------------------------------------------------------------------
--- Internals
-
--- Just like inlinePerformIO, but we inline it. Big performance gains as
--- it exposes lots of things to further inlining
---
-{-# INLINE inlinePerformIO #-}
-inlinePerformIO :: IO a -> a
-#if defined(__GLASGOW_HASKELL__)
-inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
-#else
-inlinePerformIO = unsafePerformIO
-#endif
-
--- Selects white-space characters in the Latin-1 range
--- ordered by frequency
--- Idea from Ketil
-isSpaceWord8 :: Word8 -> Bool
-isSpaceWord8 w = case w of
-    0x20 -> True -- SPACE
-    0x0A -> True -- LF, \n
-    0x09 -> True -- HT, \t
-    0x0C -> True -- FF, \f
-    0x0D -> True -- CR, \r
-    0x0B -> True -- VT, \v
-    0xA0 -> True -- spotted by QC..
-    _    -> False
-{-# INLINE isSpaceWord8 #-}
+readInt as
+    | null as   = Nothing
+    | otherwise =
+        case unsafeHead as of
+            '-' -> loop True  0 0 (unsafeTail as)
+            '+' -> loop False 0 0 (unsafeTail as)
+            _   -> loop False 0 0 as
+
+    where loop :: Bool -> Int -> Int -> ByteString -> Maybe (Int, ByteString)
+          STRICT4(loop)
+          loop neg i n ps
+              | null ps   = end neg i n ps
+              | otherwise =
+                  case B.unsafeHead ps of
+                    w | w >= 0x30
+                     && w <= 0x39 -> loop neg (i+1)
+                                          (n * 10 + (fromIntegral w - 0x30))
+                                          (unsafeTail ps)
+                      | otherwise -> end neg i n ps
+
+          end _    0 _ _  = Nothing
+          end True _ n ps = Just (negate n, ps)
+          end _    _ n ps = Just (n, ps)
 
 -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
 -- slightly faster for one-shot cases.
diff --git a/Data/ByteString/Fusion.hs b/Data/ByteString/Fusion.hs
new file mode 100644
index 0000000..99cfa2b
--- /dev/null
+++ b/Data/ByteString/Fusion.hs
@@ -0,0 +1,700 @@
+{-# OPTIONS_GHC -cpp -fglasgow-exts -fno-warn-orphans #-}
+--
+-- Module      : Data.ByteString.Fusion
+-- License     : BSD-style
+-- Maintainer  : dons@cse.unsw.edu.au
+-- Stability   : experimental
+-- Portability : portable, requires ffi and cpp
+-- Tested with : GHC 6.4.1 and Hugs March 2005
+-- 
+
+--
+-- | Functional array fusion for ByteStrings. 
+--
+-- Originally based on code from the Data Parallel Haskell project, 
+--      <http://www.cse.unsw.edu.au/~chak/project/dph>
+--
+module Data.ByteString.Fusion (
+
+    -- * Fusion utilities
+    loopU, loopL, fuseEFL,
+    NoAcc(NoAcc), loopArr, loopAcc, loopSndAcc, unSP,
+    mapEFL, filterEFL, foldEFL, foldEFL', scanEFL, mapAccumEFL, mapIndexEFL,
+
+    -- ** Alternative Fusion stuff
+    -- | This replaces 'loopU' with 'loopUp'
+    -- and adds several further special cases of loops.
+    loopUp, loopDown, loopNoAcc, loopMap, loopFilter,
+    loopWrapper, sequenceLoops,
+    doUpLoop, doDownLoop, doNoAccLoop, doMapLoop, doFilterLoop,
+
+    -- | These are the special fusion cases for combining each loop form perfectly. 
+    fuseAccAccEFL, fuseAccNoAccEFL, fuseNoAccAccEFL, fuseNoAccNoAccEFL,
+    fuseMapAccEFL, fuseAccMapEFL, fuseMapNoAccEFL, fuseNoAccMapEFL,
+    fuseMapMapEFL, fuseAccFilterEFL, fuseFilterAccEFL, fuseNoAccFilterEFL,
+    fuseFilterNoAccEFL, fuseFilterFilterEFL, fuseMapFilterEFL, fuseFilterMapEFL,
+
+    -- * Strict pairs and sums
+    PairS(..), MaybeS(..)
+
+  ) where
+
+import Data.ByteString.Base
+
+import Foreign.ForeignPtr
+import Foreign.Ptr
+import Foreign.Storable         (Storable(..))
+
+import Data.Word                (Word8)
+import System.IO.Unsafe         (unsafePerformIO)
+
+-- -----------------------------------------------------------------------------
+--
+-- Useful macros, until we have bang patterns
+--
+
+#define STRICT1(f) f a | a `seq` False = undefined
+#define STRICT2(f) f a b | a `seq` b `seq` False = undefined
+#define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
+#define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
+#define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
+
+infixl 2 :*:
+
+-- |Strict pair
+data PairS a b = !a :*: !b deriving (Eq,Ord,Show)
+
+-- |Strict Maybe
+data MaybeS a = NothingS | JustS !a deriving (Eq,Ord,Show)
+
+-- |Data type for accumulators which can be ignored. The rewrite rules rely on
+-- the fact that no bottoms of this type are ever constructed; hence, we can
+-- assume @(_ :: NoAcc) `seq` x = x@.
+--
+data NoAcc = NoAcc
+
+-- |Type of loop functions
+type AccEFL acc = acc -> Word8 -> (PairS acc (MaybeS Word8))
+type NoAccEFL   =        Word8 ->             MaybeS Word8
+type MapEFL     =        Word8 ->                    Word8
+type FilterEFL  =        Word8 ->             Bool
+
+infixr 9 `fuseEFL`
+
+-- |Fuse to flat loop functions
+fuseEFL :: AccEFL acc1 -> AccEFL acc2 -> AccEFL (PairS acc1 acc2)
+fuseEFL f g (acc1 :*: acc2) e1 =
+    case f acc1 e1 of
+        acc1' :*: NothingS -> (acc1' :*: acc2) :*: NothingS
+        acc1' :*: JustS e2 ->
+            case g acc2 e2 of
+                acc2' :*: res -> (acc1' :*: acc2') :*: res
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] fuseEFL #-}
+#endif
+
+-- | Special forms of loop arguments
+--
+-- * These are common special cases for the three function arguments of gen
+--   and loop; we give them special names to make it easier to trigger RULES
+--   applying in the special cases represented by these arguments.  The
+--   "INLINE [1]" makes sure that these functions are only inlined in the last
+--   two simplifier phases.
+--
+-- * In the case where the accumulator is not needed, it is better to always
+--   explicitly return a value `()', rather than just copy the input to the
+--   output, as the former gives GHC better local information.
+-- 
+
+-- | Element function expressing a mapping only
+#if !defined(LOOPNOACC_FUSION)
+mapEFL :: (Word8 -> Word8) -> AccEFL NoAcc
+mapEFL f = \_ e -> (NoAcc :*: (JustS $ f e))
+#else
+mapEFL :: (Word8 -> Word8) -> NoAccEFL
+mapEFL f = \e -> JustS (f e)
+#endif
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] mapEFL #-}
+#endif
+
+-- | Element function implementing a filter function only
+#if !defined(LOOPNOACC_FUSION)
+filterEFL :: (Word8 -> Bool) -> AccEFL NoAcc
+filterEFL p = \_ e -> if p e then (NoAcc :*: JustS e) else (NoAcc :*: NothingS)
+#else
+filterEFL :: (Word8 -> Bool) -> NoAccEFL
+filterEFL p = \e -> if p e then JustS e else NothingS
+#endif
+
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] filterEFL #-}
+#endif
+
+-- |Element function expressing a reduction only
+foldEFL :: (acc -> Word8 -> acc) -> AccEFL acc
+foldEFL f = \a e -> (f a e :*: NothingS)
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] foldEFL #-}
+#endif
+
+-- | A strict foldEFL.
+foldEFL' :: (acc -> Word8 -> acc) -> AccEFL acc
+foldEFL' f = \a e -> let a' = f a e in a' `seq` (a' :*: NothingS)
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] foldEFL' #-}
+#endif
+
+-- | Element function expressing a prefix reduction only
+--
+scanEFL :: (Word8 -> Word8 -> Word8) -> AccEFL Word8
+scanEFL f = \a e -> (f a e :*: JustS a)
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] scanEFL #-}
+#endif
+
+-- | Element function implementing a map and fold
+--
+mapAccumEFL :: (acc -> Word8 -> (acc, Word8)) -> AccEFL acc
+mapAccumEFL f = \a e -> case f a e of (a', e') -> (a' :*: JustS e')
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] mapAccumEFL #-}
+#endif
+
+-- | Element function implementing a map with index
+--
+mapIndexEFL :: (Int -> Word8 -> Word8) -> AccEFL Int
+mapIndexEFL f = \i e -> let i' = i+1 in i' `seq` (i' :*: JustS (f i e))
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] mapIndexEFL #-}
+#endif
+
+-- | Projection functions that are fusion friendly (as in, we determine when
+-- they are inlined)
+loopArr :: (PairS acc arr) -> arr
+loopArr (_ :*: arr) = arr
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] loopArr #-}
+#endif
+
+loopAcc :: (PairS acc arr) -> acc
+loopAcc (acc :*: _) = acc
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] loopAcc #-}
+#endif
+
+loopSndAcc :: (PairS (PairS acc1 acc2) arr) -> (PairS acc2 arr)
+loopSndAcc ((_ :*: acc) :*: arr) = (acc :*: arr)
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] loopSndAcc #-}
+#endif
+
+unSP :: (PairS acc arr) -> (acc, arr)
+unSP (acc :*: arr) = (acc, arr)
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] unSP #-}
+#endif
+
+------------------------------------------------------------------------
+--
+-- Loop combinator and fusion rules for flat arrays
+-- |Iteration over over ByteStrings
+
+-- | Iteration over over ByteStrings
+loopU :: AccEFL acc                 -- ^ mapping & folding, once per elem
+      -> acc                        -- ^ initial acc value
+      -> ByteString                 -- ^ input ByteString
+      -> (PairS acc ByteString)
+
+loopU f start (PS z s i) = unsafePerformIO $ withForeignPtr z $ \a -> do
+    (ps, acc) <- createAndTrim' i $ \p -> do
+      (acc' :*: i') <- go (a `plusPtr` s) p start
+      return (0, i', acc')
+    return (acc :*: ps)
+
+  where
+    go p ma = trans 0 0
+        where
+            STRICT3(trans)
+            trans a_off ma_off acc
+                | a_off >= i = return (acc :*: ma_off)
+                | otherwise  = do
+                    x <- peekByteOff p a_off
+                    let (acc' :*: oe) = f acc x
+                    ma_off' <- case oe of
+                        NothingS -> return ma_off
+                        JustS e  -> do pokeByteOff ma ma_off e
+                                       return $ ma_off + 1
+                    trans (a_off+1) ma_off' acc'
+
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] loopU #-}
+#endif
+
+{-# RULES
+
+"loop/loop fusion!" forall em1 em2 start1 start2 arr.
+  loopU em2 start2 (loopArr (loopU em1 start1 arr)) =
+    loopSndAcc (loopU (em1 `fuseEFL` em2) (start1 :*: start2) arr)
+
+  #-}
+
+--
+-- Functional list/array fusion for lazy ByteStrings.
+--
+loopL :: AccEFL acc          -- ^ mapping & folding, once per elem
+      -> acc                 -- ^ initial acc value
+      -> [ByteString]        -- ^ input ByteString
+      -> PairS acc [ByteString]
+loopL f = loop
+  where loop s []     = (s :*: [])
+        loop s (x:xs)
+          | l == 0    = (s'' :*: ys)
+          | otherwise = (s'' :*: y:ys)
+          where (s'  :*: y@(PS _ _ l)) = loopU f s x -- avoid circular dep on P.null
+                (s'' :*: ys)           = loop s' xs
+
+#if defined(__GLASGOW_HASKELL__)
+{-# INLINE [1] loopL #-}
+#endif
+
+{-# RULES
+
+"lazy loop/loop fusion!" forall em1 em2 start1 start2 arr.
+  loopL em2 start2 (loopArr (loopL em1 start1 arr)) =
+    loopSndAcc (loopL (em1 `fuseEFL` em2) (start1 :*: start2) arr)
+
+  #-}
+
+
+{-
+
+Alternate experimental formulation of loopU which partitions it into
+an allocating wrapper and an imperitive array-mutating loop.
+
+The point in doing this split is that we might be able to fuse multiple
+loops into a single wrapper. This would save reallocating another buffer.
+It should also give better cache locality by reusing the buffer.
+
+Note that this stuff needs ghc-6.5 from May 26 or later for the RULES to
+really work reliably.
+
+-}
+
+loopUp :: AccEFL acc -> acc -> ByteString -> PairS acc ByteString
+loopUp f a arr = loopWrapper (doUpLoop f a) arr
+{-# INLINE loopUp #-}
+
+loopDown :: AccEFL acc -> acc -> ByteString -> PairS acc ByteString
+loopDown f a arr = loopWrapper (doDownLoop f a) arr
+{-# INLINE loopDown #-}
+
+loopNoAcc :: NoAccEFL -> ByteString -> PairS NoAcc ByteString
+loopNoAcc f arr = loopWrapper (doNoAccLoop f NoAcc) arr
+{-# INLINE loopNoAcc #-}
+
+loopMap :: MapEFL -> ByteString -> PairS NoAcc ByteString
+loopMap f arr = loopWrapper (doMapLoop f NoAcc) arr
+{-# INLINE loopMap #-}
+
+loopFilter :: FilterEFL -> ByteString -> PairS NoAcc ByteString
+loopFilter f arr = loopWrapper (doFilterLoop f NoAcc) arr
+{-# INLINE loopFilter #-}
+
+-- The type of imperitive loops that fill in a destination array by
+-- reading a source array. They may not fill in the whole of the dest
+-- array if the loop is behaving as a filter, this is why we return
+-- the length that was filled in. The loop may also accumulate some
+-- value as it loops over the source array.
+--
+type ImperativeLoop acc =
+    Ptr Word8          -- pointer to the start of the source byte array
+ -> Ptr Word8          -- pointer to ther start of the destination byte array
+ -> Int                -- length of the source byte array
+ -> IO (PairS (PairS acc Int) Int) -- result and offset, length of dest that was filled
+
+loopWrapper :: ImperativeLoop acc -> ByteString -> PairS acc ByteString
+loopWrapper body (PS srcFPtr srcOffset srcLen) = unsafePerformIO $
+    withForeignPtr srcFPtr $ \srcPtr -> do
+    (ps, acc) <- createAndTrim' srcLen $ \destPtr -> do
+        (acc :*: destOffset :*: destLen) <-
+          body (srcPtr `plusPtr` srcOffset) destPtr srcLen
+        return (destOffset, destLen, acc)
+    return (acc :*: ps)
+
+doUpLoop :: AccEFL acc -> acc -> ImperativeLoop acc
+doUpLoop f acc0 src dest len = loop 0 0 acc0
+  where STRICT3(loop)
+        loop src_off dest_off acc
+            | src_off >= len = return (acc :*: 0 :*: dest_off)
+            | otherwise      = do
+                x <- peekByteOff src src_off
+                case f acc x of
+                  (acc' :*: NothingS) -> loop (src_off+1) dest_off acc'
+                  (acc' :*: JustS x') -> pokeByteOff dest dest_off x'
+                                      >> loop (src_off+1) (dest_off+1) acc'
+
+doDownLoop :: AccEFL acc -> acc -> ImperativeLoop acc
+doDownLoop f acc0 src dest len = loop (len-1) (len-1) acc0
+  where STRICT3(loop)
+        loop src_off dest_off acc
+            | src_off < 0 = return (acc :*: dest_off + 1 :*: len - (dest_off + 1))
+            | otherwise   = do
+                x <- peekByteOff src src_off
+                case f acc x of
+                  (acc' :*: NothingS) -> loop (src_off-1) dest_off acc'
+                  (acc' :*: JustS x') -> pokeByteOff dest dest_off x'
+                                      >> loop (src_off-1) (dest_off-1) acc'
+
+doNoAccLoop :: NoAccEFL -> noAcc -> ImperativeLoop noAcc
+doNoAccLoop f noAcc src dest len = loop 0 0
+  where STRICT2(loop)
+        loop src_off dest_off
+            | src_off >= len = return (noAcc :*: 0 :*: dest_off)
+            | otherwise      = do
+                x <- peekByteOff src src_off
+                case f x of
+                  NothingS -> loop (src_off+1) dest_off
+                  JustS x' -> pokeByteOff dest dest_off x'
+                           >> loop (src_off+1) (dest_off+1)
+
+doMapLoop :: MapEFL -> noAcc -> ImperativeLoop noAcc
+doMapLoop f noAcc src dest len = loop 0
+  where STRICT1(loop)
+        loop n
+            | n >= len = return (noAcc :*: 0 :*: len)
+            | otherwise      = do
+                x <- peekByteOff src n
+                pokeByteOff dest n (f x)
+                loop (n+1) -- offset always the same, only pass 1 arg
+
+doFilterLoop :: FilterEFL -> noAcc -> ImperativeLoop noAcc
+doFilterLoop f noAcc src dest len = loop 0 0
+  where STRICT2(loop)
+        loop src_off dest_off
+            | src_off >= len = return (noAcc :*: 0 :*: dest_off)
+            | otherwise      = do
+                x <- peekByteOff src src_off
+                if f x
+                  then pokeByteOff dest dest_off x
+                    >> loop (src_off+1) (dest_off+1)
+                  else loop (src_off+1) dest_off
+
+-- run two loops in sequence,
+-- think of it as: loop1 >> loop2
+sequenceLoops :: ImperativeLoop acc1
+              -> ImperativeLoop acc2
+              -> ImperativeLoop (PairS acc1 acc2)
+sequenceLoops loop1 loop2 src dest len0 = do
+  (acc1 :*: off1 :*: len1) <- loop1 src dest len0
+  (acc2 :*: off2 :*: len2) <-
+    let src'  = dest `plusPtr` off1
+        dest' = src' -- note that we are using dest == src
+                     -- for the second loop as we are
+                     -- mutating the dest array in-place!
+     in loop2 src' dest' len1
+  return ((acc1  :*: acc2) :*: off1 + off2 :*: len2)
+
+  -- TODO: prove that this is associative! (I think it is)
+  -- since we can't be sure how the RULES will combine loops.
+
+#if defined(__GLASGOW_HASKELL__)
+
+{-# INLINE [1] doUpLoop             #-}
+{-# INLINE [1] doDownLoop           #-}
+{-# INLINE [1] doNoAccLoop          #-}
+{-# INLINE [1] doMapLoop            #-}
+{-# INLINE [1] doFilterLoop         #-}
+
+{-# INLINE [1] loopWrapper          #-}
+{-# INLINE [1] sequenceLoops        #-}
+
+{-# INLINE [1] fuseAccAccEFL        #-}
+{-# INLINE [1] fuseAccNoAccEFL      #-}
+{-# INLINE [1] fuseNoAccAccEFL      #-}
+{-# INLINE [1] fuseNoAccNoAccEFL    #-}
+{-# INLINE [1] fuseMapAccEFL        #-}
+{-# INLINE [1] fuseAccMapEFL        #-}
+{-# INLINE [1] fuseMapNoAccEFL      #-}
+{-# INLINE [1] fuseNoAccMapEFL      #-}
+{-# INLINE [1] fuseMapMapEFL        #-}
+{-# INLINE [1] fuseAccFilterEFL     #-}
+{-# INLINE [1] fuseFilterAccEFL     #-}
+{-# INLINE [1] fuseNoAccFilterEFL   #-}
+{-# INLINE [1] fuseFilterNoAccEFL   #-}
+{-# INLINE [1] fuseFilterFilterEFL  #-}
+{-# INLINE [1] fuseMapFilterEFL     #-}
+{-# INLINE [1] fuseFilterMapEFL     #-}
+
+#endif
+
+{-# RULES
+
+"loopArr/loopSndAcc" forall x.
+  loopArr (loopSndAcc x) = loopArr x
+
+"seq/NoAcc" forall (u::NoAcc) e.
+  u `seq` e = e
+
+"loop/loop wrapper elimination" forall loop1 loop2 arr.
+  loopWrapper loop2 (loopArr (loopWrapper loop1 arr)) =
+    loopSndAcc (loopWrapper (sequenceLoops loop1 loop2) arr)
+
+--
+-- n.b in the following, when reading n/m fusion, recall sequenceLoops
+-- is monadic, so its really n >> m fusion (i.e. m.n), not n . m fusion.
+--
+
+"up/up loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doUpLoop f1 acc1) (doUpLoop f2 acc2) =
+    doUpLoop (f1 `fuseAccAccEFL` f2) (acc1 :*: acc2)
+
+"map/map loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doMapLoop f1 acc1) (doMapLoop f2 acc2) =
+    doMapLoop (f1 `fuseMapMapEFL` f2) (acc1 :*: acc2)
+
+"filter/filter loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doFilterLoop f1 acc1) (doFilterLoop f2 acc2) =
+    doFilterLoop (f1 `fuseFilterFilterEFL` f2) (acc1 :*: acc2)
+
+"map/filter loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doMapLoop f1 acc1) (doFilterLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseMapFilterEFL` f2) (acc1 :*: acc2)
+
+"filter/map loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doFilterLoop f1 acc1) (doMapLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseFilterMapEFL` f2) (acc1 :*: acc2)
+
+"map/up loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doMapLoop f1 acc1) (doUpLoop f2 acc2) =
+    doUpLoop (f1 `fuseMapAccEFL` f2) (acc1 :*: acc2)
+
+"up/map loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doUpLoop f1 acc1) (doMapLoop f2 acc2) =
+    doUpLoop (f1 `fuseAccMapEFL` f2) (acc1 :*: acc2)
+
+"filter/up loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doFilterLoop f1 acc1) (doUpLoop f2 acc2) =
+    doUpLoop (f1 `fuseFilterAccEFL` f2) (acc1 :*: acc2)
+
+"up/filter loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doUpLoop f1 acc1) (doFilterLoop f2 acc2) =
+    doUpLoop (f1 `fuseAccFilterEFL` f2) (acc1 :*: acc2)
+
+"down/down loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doDownLoop f1 acc1) (doDownLoop f2 acc2) =
+    doDownLoop (f1 `fuseAccAccEFL` f2) (acc1 :*: acc2)
+
+"map/down fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doMapLoop f1 acc1) (doDownLoop f2 acc2) =
+    doDownLoop (f1 `fuseMapAccEFL` f2) (acc1 :*: acc2)
+
+"down/map loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doDownLoop f1 acc1) (doMapLoop f2 acc2) =
+    doDownLoop (f1 `fuseAccMapEFL` f2) (acc1 :*: acc2)
+
+"filter/down fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doFilterLoop f1 acc1) (doDownLoop f2 acc2) =
+    doDownLoop (f1 `fuseFilterAccEFL` f2) (acc1 :*: acc2)
+
+"down/filter loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doDownLoop f1 acc1) (doFilterLoop f2 acc2) =
+    doDownLoop (f1 `fuseAccFilterEFL` f2) (acc1 :*: acc2)
+
+"noAcc/noAcc loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doNoAccLoop f1 acc1) (doNoAccLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseNoAccNoAccEFL` f2) (acc1 :*: acc2)
+
+"noAcc/up loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doNoAccLoop f1 acc1) (doUpLoop f2 acc2) =
+    doUpLoop (f1 `fuseNoAccAccEFL` f2) (acc1 :*: acc2)
+
+"up/noAcc loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doUpLoop f1 acc1) (doNoAccLoop f2 acc2) =
+    doUpLoop (f1 `fuseAccNoAccEFL` f2) (acc1 :*: acc2)
+
+"map/noAcc loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doMapLoop f1 acc1) (doNoAccLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseMapNoAccEFL` f2) (acc1 :*: acc2)
+
+"noAcc/map loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doNoAccLoop f1 acc1) (doMapLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseNoAccMapEFL` f2) (acc1 :*: acc2)
+
+"filter/noAcc loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doFilterLoop f1 acc1) (doNoAccLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseFilterNoAccEFL` f2) (acc1 :*: acc2)
+
+"noAcc/filter loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doNoAccLoop f1 acc1) (doFilterLoop f2 acc2) =
+    doNoAccLoop (f1 `fuseNoAccFilterEFL` f2) (acc1 :*: acc2)
+
+"noAcc/down loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doNoAccLoop f1 acc1) (doDownLoop f2 acc2) =
+    doDownLoop (f1 `fuseNoAccAccEFL` f2) (acc1 :*: acc2)
+
+"down/noAcc loop fusion" forall f1 f2 acc1 acc2.
+  sequenceLoops (doDownLoop f1 acc1) (doNoAccLoop f2 acc2) =
+    doDownLoop (f1 `fuseAccNoAccEFL` f2) (acc1 :*: acc2)
+
+  #-}
+
+{-
+
+up      = up loop
+down    = down loop
+map     = map special case
+filter  = filter special case
+noAcc   = noAcc undirectional loop (unused)
+
+heirarchy:
+  up     down
+   ^     ^
+    \   /
+    noAcc
+     ^ ^
+    /   \
+ map     filter
+
+each is a special case of the things above
+
+so we get rules that combine things on the same level
+and rules that combine things on different levels
+to get something on the higher level
+
+so all the cases:
+up/up         --> up     fuseAccAccEFL
+down/down     --> down   fuseAccAccEFL
+noAcc/noAcc   --> noAcc  fuseNoAccNoAccEFL
+
+noAcc/up      --> up     fuseNoAccAccEFL
+up/noAcc      --> up     fuseAccNoAccEFL
+noAcc/down    --> down   fuseNoAccAccEFL
+down/noAcc    --> down   fuseAccNoAccEFL
+
+and if we do the map, filter special cases then it adds a load more:
+
+map/map       --> map    fuseMapMapEFL
+filter/filter --> filter fuseFilterFilterEFL
+
+map/filter    --> noAcc  fuseMapFilterEFL
+filter/map    --> noAcc  fuseFilterMapEFL
+
+map/noAcc     --> noAcc  fuseMapNoAccEFL
+noAcc/map     --> noAcc  fuseNoAccMapEFL
+
+map/up        --> up     fuseMapAccEFL
+up/map        --> up     fuseAccMapEFL
+
+map/down      --> down   fuseMapAccEFL
+down/map      --> down   fuseAccMapEFL
+
+filter/noAcc  --> noAcc  fuseNoAccFilterEFL
+noAcc/filter  --> noAcc  fuseFilterNoAccEFL
+
+filter/up     --> up     fuseFilterAccEFL
+up/filter     --> up     fuseAccFilterEFL
+
+filter/down   --> down   fuseFilterAccEFL
+down/filter   --> down   fuseAccFilterEFL
+-}
+
+fuseAccAccEFL :: AccEFL acc1 -> AccEFL acc2 -> AccEFL (PairS acc1 acc2)
+fuseAccAccEFL f g (acc1 :*: acc2) e1 =
+    case f acc1 e1 of
+        acc1' :*: NothingS -> (acc1' :*: acc2) :*: NothingS
+        acc1' :*: JustS e2 ->
+            case g acc2 e2 of
+                acc2' :*: res -> (acc1' :*: acc2') :*: res
+
+fuseAccNoAccEFL :: AccEFL acc -> NoAccEFL -> AccEFL (PairS acc noAcc)
+fuseAccNoAccEFL f g (acc :*: noAcc) e1 =
+    case f acc e1 of
+        acc' :*: NothingS -> (acc' :*: noAcc) :*: NothingS
+        acc' :*: JustS e2 -> (acc' :*: noAcc) :*: g e2
+
+fuseNoAccAccEFL :: NoAccEFL -> AccEFL acc -> AccEFL (PairS noAcc acc)
+fuseNoAccAccEFL f g (noAcc :*: acc) e1 =
+    case f e1 of
+        NothingS -> (noAcc :*: acc) :*: NothingS
+        JustS e2 ->
+            case g acc e2 of
+                acc' :*: res -> (noAcc :*: acc') :*: res
+
+fuseNoAccNoAccEFL :: NoAccEFL -> NoAccEFL -> NoAccEFL
+fuseNoAccNoAccEFL f g e1 =
+    case f e1 of
+        NothingS -> NothingS
+        JustS e2 -> g e2
+
+fuseMapAccEFL :: MapEFL -> AccEFL acc -> AccEFL (PairS noAcc acc)
+fuseMapAccEFL f g (noAcc :*: acc) e1 =
+    case g acc (f e1) of
+        (acc' :*: res) -> (noAcc :*: acc') :*: res
+
+fuseAccMapEFL :: AccEFL acc -> MapEFL -> AccEFL (PairS acc noAcc)
+fuseAccMapEFL f g (acc :*: noAcc) e1 =
+    case f acc e1 of
+        (acc' :*: NothingS) -> (acc' :*: noAcc) :*: NothingS
+        (acc' :*: JustS e2) -> (acc' :*: noAcc) :*: JustS (g e2)
+
+fuseMapMapEFL :: MapEFL -> MapEFL -> MapEFL
+fuseMapMapEFL   f g e1 = g (f e1)     -- n.b. perfect fusion
+
+fuseMapNoAccEFL :: MapEFL -> NoAccEFL -> NoAccEFL
+fuseMapNoAccEFL f g e1 = g (f e1)
+
+fuseNoAccMapEFL :: NoAccEFL -> MapEFL -> NoAccEFL
+fuseNoAccMapEFL f g e1 =
+    case f e1 of
+        NothingS -> NothingS
+        JustS e2 -> JustS (g e2)
+
+fuseAccFilterEFL :: AccEFL acc -> FilterEFL -> AccEFL (PairS acc noAcc)
+fuseAccFilterEFL f g (acc :*: noAcc) e1 =
+    case f acc e1 of
+        acc' :*: NothingS -> (acc' :*: noAcc) :*: NothingS
+        acc' :*: JustS e2 ->
+            case g e2 of
+                False -> (acc' :*: noAcc) :*: NothingS
+                True  -> (acc' :*: noAcc) :*: JustS e2
+
+fuseFilterAccEFL :: FilterEFL -> AccEFL acc -> AccEFL (PairS noAcc acc)
+fuseFilterAccEFL f g (noAcc :*: acc) e1 =
+    case f e1 of
+        False -> (noAcc :*: acc) :*: NothingS
+        True  ->
+            case g acc e1 of
+                acc' :*: res -> (noAcc :*: acc') :*: res
+
+fuseNoAccFilterEFL :: NoAccEFL -> FilterEFL -> NoAccEFL
+fuseNoAccFilterEFL f g e1 =
+    case f e1 of
+        NothingS -> NothingS
+        JustS e2 ->
+            case g e2 of
+                False -> NothingS
+                True  -> JustS e2
+
+fuseFilterNoAccEFL :: FilterEFL -> NoAccEFL -> NoAccEFL
+fuseFilterNoAccEFL f g e1 =
+    case f e1 of
+        False -> NothingS
+        True  -> g e1
+
+fuseFilterFilterEFL :: FilterEFL -> FilterEFL -> FilterEFL
+fuseFilterFilterEFL f g e1 = f e1 && g e1
+
+fuseMapFilterEFL :: MapEFL -> FilterEFL -> NoAccEFL
+fuseMapFilterEFL f g e1 =
+    case f e1 of
+        e2 -> case g e2 of
+            False -> NothingS
+            True  -> JustS e2
+
+fuseFilterMapEFL :: FilterEFL -> MapEFL -> NoAccEFL
+fuseFilterMapEFL f g e1 =
+    case f e1 of
+        False -> NothingS
+        True  -> JustS (g e1)
+
diff --git a/Data/ByteString/Lazy.hs b/Data/ByteString/Lazy.hs
new file mode 100644
index 0000000..17e181f
--- /dev/null
+++ b/Data/ByteString/Lazy.hs
@@ -0,0 +1,1246 @@
+{-# OPTIONS_GHC -cpp -optc-O1 -fffi -fglasgow-exts -fno-warn-incomplete-patterns #-}
+--
+-- -optc-O2 breaks with 4.0.4 gcc on debian
+--
+-- Module      : ByteString.Lazy
+-- Copyright   : (c) Don Stewart 2006
+--               (c) Duncan Coutts 2006
+-- License     : BSD-style
+--
+-- Maintainer  : dons@cse.unsw.edu.au
+-- Stability   : experimental
+-- Portability : portable, requires ffi and cpp
+-- Tested with : GHC 6.4.1 and Hugs March 2005
+-- 
+
+--
+-- | A time and space-efficient implementation of lazy byte vectors
+-- using lists of packed 'Word8' arrays, suitable for high performance
+-- use, both in terms of large data quantities, or high speed
+-- requirements. Byte vectors are encoded as lazy lists of strict 'Word8'
+-- arrays of bytes. They provide a means to manipulate large byte vectors
+-- without requiring the entire vector be resident in memory.
+--
+-- Some operations, such as concat, append, reverse and cons, have
+-- better complexity than their "Data.ByteString" equivalents, as due to
+-- optimisations resulting from the list spine structure. And for other
+-- operations Lazy ByteStrings are usually within a few percent of
+-- strict ones, but with better heap usage. For data larger than the
+-- available memory, or if you have tight memory constraints, this
+-- module will be the only option. The default chunk size is 64k, which
+-- should be good in most circumstances. For people with large L2
+-- caches, you may want to increase this to fit your cache.
+--
+-- This module is intended to be imported @qualified@, to avoid name
+-- clashes with "Prelude" functions.  eg.
+--
+-- > import qualified Data.ByteString.Lazy as B
+--
+-- Original GHC implementation by Bryan O\'Sullivan. Rewritten to use
+-- UArray by Simon Marlow. Rewritten to support slices and use
+-- ForeignPtr by David Roundy. Polished and extended by Don Stewart.
+-- Lazy variant by Duncan Coutts and Don Stewart.
+--
+
+module Data.ByteString.Lazy (
+
+        -- * The @ByteString@ type
+        ByteString(..),         -- instances: Eq, Ord, Show, Read, Data, Typeable
+
+        -- * Introducing and eliminating 'ByteString's
+        empty,                  -- :: ByteString
+        singleton,               -- :: Word8   -> ByteString
+        pack,                   -- :: [Word8] -> ByteString
+        unpack,                 -- :: ByteString -> [Word8]
+        packWith,               -- :: (a -> Word8) -> [a] -> ByteString
+        unpackWith,             -- :: (Word8 -> a) -> ByteString -> [a]
+
+        -- * Basic interface
+        cons,                   -- :: Word8 -> ByteString -> ByteString
+        snoc,                   -- :: ByteString -> Word8 -> ByteString
+        append,                 -- :: ByteString -> ByteString -> ByteString
+        head,                   -- :: ByteString -> Word8
+        last,                   -- :: ByteString -> Word8
+        tail,                   -- :: ByteString -> ByteString
+        init,                   -- :: ByteString -> ByteString
+        null,                   -- :: ByteString -> Bool
+        length,                 -- :: ByteString -> Int64
+
+        -- * Transformating ByteStrings
+        map,                    -- :: (Word8 -> Word8) -> ByteString -> ByteString
+        reverse,                -- :: ByteString -> ByteString
+--      intersperse,            -- :: Word8 -> ByteString -> ByteString
+        transpose,              -- :: [ByteString] -> [ByteString]
+
+        -- * Reducing 'ByteString's (folds)
+        foldl,                  -- :: (a -> Word8 -> a) -> a -> ByteString -> a
+        foldl',                 -- :: (a -> Word8 -> a) -> a -> ByteString -> a
+        foldl1,                 -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
+        foldl1',                -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
+        foldr,                  -- :: (Word8 -> a -> a) -> a -> ByteString -> a
+        foldr1,                 -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
+
+        -- ** Special folds
+        concat,                 -- :: [ByteString] -> ByteString
+        concatMap,              -- :: (Word8 -> ByteString) -> ByteString -> ByteString
+        any,                    -- :: (Word8 -> Bool) -> ByteString -> Bool
+        all,                    -- :: (Word8 -> Bool) -> ByteString -> Bool
+        maximum,                -- :: ByteString -> Word8
+        minimum,                -- :: ByteString -> Word8
+
+        -- * Building ByteStrings
+        -- ** Scans
+        scanl,                  -- :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
+--      scanl1,                 -- :: (Word8 -> Word8 -> Word8) -> ByteString -> ByteString
+--      scanr,                  -- :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
+--      scanr1,                 -- :: (Word8 -> Word8 -> Word8) -> ByteString -> ByteString
+
+        -- ** Accumulating maps
+        mapAccumL,  -- :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
+        mapIndexed, -- :: (Int64 -> Word8 -> Word8) -> ByteString -> ByteString
+
+        -- ** Infinite ByteStrings
+        repeat,                 -- :: Word8 -> ByteString
+        replicate,              -- :: Int64 -> Word8 -> ByteString
+        cycle,                  -- :: ByteString -> ByteString
+        iterate,                -- :: (Word8 -> Word8) -> Word8 -> ByteString
+
+        -- ** Unfolding
+        unfoldr,                -- :: (a -> Maybe (Word8, a)) -> a -> ByteString
+
+        -- * Substrings
+
+        -- ** Breaking strings
+        take,                   -- :: Int64 -> ByteString -> ByteString
+        drop,                   -- :: Int64 -> ByteString -> ByteString
+        splitAt,                -- :: Int64 -> ByteString -> (ByteString, ByteString)
+        takeWhile,              -- :: (Word8 -> Bool) -> ByteString -> ByteString
+        dropWhile,              -- :: (Word8 -> Bool) -> ByteString -> ByteString
+        span,                   -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+        break,                  -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+        group,                  -- :: ByteString -> [ByteString]
+        groupBy,                -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+        inits,                  -- :: ByteString -> [ByteString]
+        tails,                  -- :: ByteString -> [ByteString]
+
+        -- ** Breaking and dropping on specific bytes
+        breakByte,              -- :: Word8 -> ByteString -> (ByteString, ByteString)
+        spanByte,               -- :: Word8 -> ByteString -> (ByteString, ByteString)
+
+        -- ** Breaking into many substrings
+        split,                  -- :: Word8 -> ByteString -> [ByteString]
+        splitWith,              -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
+        tokens,                 -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
+
+        -- ** Joining strings
+        join,                   -- :: ByteString -> [ByteString] -> ByteString
+        joinWithByte,           -- :: Word8 -> ByteString -> ByteString -> ByteString
+
+        -- * Predicates
+        isPrefixOf,             -- :: ByteString -> ByteString -> Bool
+--      isSuffixOf,             -- :: ByteString -> ByteString -> Bool
+
+        -- * Searching ByteStrings
+
+        -- ** Searching by equality
+        elem,                   -- :: Word8 -> ByteString -> Bool
+        notElem,                -- :: Word8 -> ByteString -> Bool
+        filterByte,             -- :: Word8 -> ByteString -> ByteString
+        filterNotByte,          -- :: Word8 -> ByteString -> ByteString
+
+        -- ** Searching with a predicate
+        find,                   -- :: (Word8 -> Bool) -> ByteString -> Maybe Word8
+        filter,                 -- :: (Word8 -> Bool) -> ByteString -> ByteString
+--      partition               -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+
+        -- * Indexing ByteStrings
+        index,                  -- :: ByteString -> Int64 -> Word8
+        elemIndex,              -- :: Word8 -> ByteString -> Maybe Int64
+        elemIndices,            -- :: Word8 -> ByteString -> [Int64]
+        findIndex,              -- :: (Word8 -> Bool) -> ByteString -> Maybe Int64
+        findIndices,            -- :: (Word8 -> Bool) -> ByteString -> [Int64]
+        count,                  -- :: Word8 -> ByteString -> Int64
+
+        -- * Zipping and unzipping ByteStrings
+        zip,                    -- :: ByteString -> ByteString -> [(Word8,Word8)]
+        zipWith,                -- :: (Word8 -> Word8 -> c) -> ByteString -> ByteString -> [c]
+--      unzip,                  -- :: [(Word8,Word8)] -> (ByteString,ByteString)
+
+        -- * Ordered ByteStrings
+--        sort,                   -- :: ByteString -> ByteString
+
+        -- * I\/O with 'ByteString's
+
+        -- ** Standard input and output
+        getContents,            -- :: IO ByteString
+        putStr,                 -- :: ByteString -> IO ()
+        putStrLn,               -- :: ByteString -> IO ()
+        interact,               -- :: (ByteString -> ByteString) -> IO ()
+
+        -- ** Files
+        readFile,               -- :: FilePath -> IO ByteString
+        writeFile,              -- :: FilePath -> ByteString -> IO ()
+        appendFile,             -- :: FilePath -> ByteString -> IO ()
+
+        -- ** I\/O with Handles
+        hGetContents,           -- :: Handle -> IO ByteString
+        hGetContentsN,          -- :: Int -> Handle -> IO ByteString
+        hGet,                   -- :: Handle -> Int -> IO ByteString
+        hGetN,                  -- :: Int -> Handle -> Int -> IO ByteString
+        hPut,                   -- :: Handle -> ByteString -> IO ()
+#if defined(__GLASGOW_HASKELL__)
+        hGetNonBlocking,        -- :: Handle -> IO ByteString
+        hGetNonBlockingN,       -- :: Int -> Handle -> IO ByteString
+#endif
+
+  ) where
+
+import qualified Prelude
+import Prelude hiding
+    (reverse,head,tail,last,init,null,length,map,lines,foldl,foldr,unlines
+    ,concat,any,take,drop,splitAt,takeWhile,dropWhile,span,break,elem,filter,maximum
+    ,minimum,all,concatMap,foldl1,foldr1,scanl, scanl1, scanr, scanr1
+    ,repeat, cycle, interact, iterate,readFile,writeFile,appendFile,replicate
+    ,getContents,getLine,putStr,putStrLn ,zip,zipWith,unzip,notElem)
+
+import qualified Data.List              as L  -- L for list/lazy
+import qualified Data.ByteString        as P  -- P for packed
+import qualified Data.ByteString.Base   as P
+import qualified Data.ByteString.Fusion as P
+import Data.ByteString.Fusion (PairS(..),loopL)
+
+import Data.Monoid              (Monoid(..))
+
+import Data.Word                (Word8)
+import Data.Int                 (Int64)
+import System.IO (Handle,stdin,stdout,openBinaryFile,IOMode(..),hClose)
+import System.IO.Unsafe
+import Control.Exception        (bracket)
+
+#if defined(__GLASGOW_HASKELL__)
+import Data.Generics            (Data(..), Typeable(..))
+#endif
+
+-- -----------------------------------------------------------------------------
+--
+-- Useful macros, until we have bang patterns
+--
+
+#define STRICT1(f) f a | a `seq` False = undefined
+#define STRICT2(f) f a b | a `seq` b `seq` False = undefined
+#define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
+#define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
+#define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
+
+-- -----------------------------------------------------------------------------
+
+-- | A space-efficient representation of a Word8 vector, supporting many
+-- efficient operations.  A 'ByteString' contains 8-bit characters only.
+--
+-- Instances of Eq, Ord, Read, Show, Data, Typeable
+--
+newtype ByteString = LPS [P.ByteString] -- LPS for lazy packed string
+    deriving (Show,Read
+#if defined(__GLASGOW_HASKELL__)
+                        ,Data, Typeable
+#endif
+             )
+
+--
+-- hmm, what about getting the PS constructor unpacked into the cons cell?
+--
+-- data List = Nil | Cons {-# UNPACK #-} !P.ByteString List
+--
+-- Would avoid one indirection per chunk.
+--
+
+unLPS :: ByteString -> [P.ByteString]
+unLPS (LPS xs) = xs
+{-# INLINE unLPS #-}
+
+instance Eq  ByteString
+    where (==)    = eq
+
+instance Ord ByteString
+    where compare = compareBytes
+
+instance Monoid ByteString where
+    mempty  = empty
+    mappend = append
+    mconcat = concat
+
+------------------------------------------------------------------------
+
+-- XXX
+-- The data type invariant:
+-- Every ByteString is either empty or consists of non-null ByteStrings.
+-- All functions must preserve this, and the QC properties must check this.
+--
+_invariant :: ByteString -> Bool
+_invariant (LPS []) = True
+_invariant (LPS xs) = L.all (not . P.null) xs
+
+-- In a form useful for QC testing
+_checkInvariant :: ByteString -> ByteString
+_checkInvariant lps
+    | _invariant lps = lps
+    | otherwise      = moduleError "invariant" ("violation: " ++ show lps)
+
+-- The Data abstraction function
+--
+_abstr :: ByteString -> P.ByteString
+_abstr (LPS []) = P.empty
+_abstr (LPS xs) = P.concat xs
+
+-- The representation uses lists of packed chunks. When we have to convert from
+-- a lazy list to the chunked representation, then by default we'll use this
+-- chunk size. Some functions give you more control over the chunk size.
+--
+-- Measurements here:
+--  http://www.cse.unsw.edu.au/~dons/tmp/chunksize_v_cache.png
+--
+-- indicate that a value around 0.5 to 1 x your L2 cache is best.
+-- The following value assumes people have something greater than 128k,
+-- and need to share the cache with other programs.
+--
+defaultChunkSize :: Int
+defaultChunkSize = 64 * k
+   where k = 1024
+
+smallChunkSize :: Int
+smallChunkSize = 4 * k
+   where k = 1024
+
+-- defaultChunkSize = 1
+
+------------------------------------------------------------------------
+
+eq :: ByteString -> ByteString -> Bool
+eq (LPS xs) (LPS ys) = eq' xs ys
+  where eq' [] [] = True
+        eq' [] _  = False
+        eq' _  [] = False
+        eq' (a:as) (b:bs) =
+          case compare (P.length a) (P.length b) of
+            LT -> a == (P.take (P.length a) b) && eq' as (P.drop (P.length a) b : bs)
+            EQ -> a == b                       && eq' as bs
+            GT -> (P.take (P.length b) a) == b && eq' (P.drop (P.length b) a : as) bs
+
+compareBytes :: ByteString -> ByteString -> Ordering
+compareBytes (LPS xs) (LPS ys) = cmp xs ys
+  where cmp [] [] = EQ
+        cmp [] _  = LT
+        cmp _  [] = GT
+        cmp (a:as) (b:bs) =
+          case compare (P.length a) (P.length b) of
+            LT -> case compare a (P.take (P.length a) b) of
+                    EQ     -> cmp as (P.drop (P.length a) b : bs)
+                    result -> result
+            EQ -> case compare a b of
+                    EQ     -> cmp as bs
+                    result -> result
+            GT -> case compare (P.take (P.length b) a) b of
+                    EQ     -> cmp (P.drop (P.length b) a : as) bs
+                    result -> result
+
+-- -----------------------------------------------------------------------------
+-- Introducing and eliminating 'ByteString's
+
+-- | /O(1)/ The empty 'ByteString'
+empty :: ByteString
+empty = LPS []
+{-# NOINLINE empty #-}
+
+-- | /O(1)/ Convert a 'Word8' into a 'ByteString'
+singleton :: Word8 -> ByteString
+singleton c = LPS [P.singleton c]
+{-# NOINLINE singleton #-}
+
+-- | /O(n)/ Convert a '[Word8]' into a 'ByteString'. 
+pack :: [Word8] -> ByteString
+pack str = LPS $ L.map P.pack (chunk defaultChunkSize str)
+
+-- ?
+chunk :: Int -> [a] -> [[a]]
+chunk _    [] = []
+chunk size xs = case L.splitAt size xs of (xs', xs'') -> xs' : chunk size xs''
+
+-- | /O(n)/ Converts a 'ByteString' to a '[Word8]'.
+unpack :: ByteString -> [Word8]
+unpack (LPS ss) = L.concatMap P.unpack ss
+{-# INLINE unpack #-}
+
+------------------------------------------------------------------------
+
+-- | /O(n)/ Convert a '[a]' into a 'ByteString' using some
+-- conversion function
+packWith :: (a -> Word8) -> [a] -> ByteString
+packWith k str = LPS $ L.map (P.packWith k) (chunk defaultChunkSize str)
+{-# INLINE packWith #-}
+{-# SPECIALIZE packWith :: (Char -> Word8) -> [Char] -> ByteString #-}
+
+-- | /O(n)/ Converts a 'ByteString' to a '[a]', using a conversion function.
+unpackWith :: (Word8 -> a) -> ByteString -> [a]
+unpackWith k (LPS ss) = L.concatMap (P.unpackWith k) ss
+{-# INLINE unpackWith #-}
+{-# SPECIALIZE unpackWith :: (Word8 -> Char) -> ByteString -> [Char] #-}
+
+-- ---------------------------------------------------------------------
+-- Basic interface
+
+-- | /O(1)/ Test whether a ByteString is empty.
+null :: ByteString -> Bool
+null (LPS []) = True
+null (_)      = False  -- TODO: guarantee this invariant is maintained
+{-# INLINE null #-}
+
+-- | /O(n\/c)/ 'length' returns the length of a ByteString as an 'Int64'
+length :: ByteString -> Int64
+length (LPS ss) = L.sum (L.map (fromIntegral.P.length) ss)
+
+-- avoid the intermediate list?
+-- length (LPS ss) = L.foldl lengthF 0 ss
+--     where lengthF n s = let m = n + fromIntegral (P.length s) in m `seq` m
+{-# INLINE length #-}
+
+-- | /O(1)/ 'cons' is analogous to '(:)' for lists. Unlike '(:)' however it is
+-- strict in the ByteString that we are consing onto. More precisely, it forces
+-- the head and the first chunk. It does this because, for space efficiency, it
+-- may coalesce the new byte onto the first \'chunk\' rather than starting a
+-- new \'chunk\'.
+--
+-- So that means you can't use a lazy recursive contruction like this:
+--
+-- > let xs = cons c xs in xs
+--
+-- You can however use 'repeat' and 'cycle' to build infinite lazy ByteStrings.
+--
+cons :: Word8 -> ByteString -> ByteString
+cons c (LPS (s:ss)) | P.length s <= 16 = LPS (P.cons c s : ss)
+cons c (LPS ss)                        = LPS (P.singleton c : ss)
+{-# INLINE cons #-}
+
+-- | /O(n\/c)/ Append a byte to the end of a 'ByteString'
+snoc :: ByteString -> Word8 -> ByteString
+snoc (LPS ss) c = LPS (ss ++ [P.singleton c])
+{-# INLINE snoc #-}
+
+-- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
+head :: ByteString -> Word8
+head (LPS [])    = errorEmptyList "head"
+head (LPS (x:_)) = P.unsafeHead x
+{-# INLINE head #-}
+
+-- | /O(1)/ Extract the elements after the head of a ByteString, which must be non-empty.
+tail :: ByteString -> ByteString
+tail (LPS [])     = errorEmptyList "tail"
+tail (LPS (x:xs))
+  | P.length x == 1 = LPS xs
+  | otherwise       = LPS (P.unsafeTail x : xs)
+{-# INLINE tail #-}
+
+-- | /O(n\/c)/ Extract the last element of a ByteString, which must be finite and non-empty.
+last :: ByteString -> Word8
+last (LPS []) = errorEmptyList "last"
+last (LPS xs) = P.last (L.last xs)
+{-# INLINE last #-}
+
+-- | /O(1)/ Return all the elements of a 'ByteString' except the last one.
+init :: ByteString -> ByteString
+init (LPS []) = errorEmptyList "init"
+init (LPS xs)
+    | P.length y == 1 = LPS ys
+    | otherwise       = LPS (ys ++ [P.init y])
+    where (y,ys) = (L.last xs, L.init xs)
+{-# INLINE init #-}
+
+-- | /O(n)/ Append two ByteStrings
+append :: ByteString -> ByteString -> ByteString
+append (LPS []) (LPS ys) = LPS ys
+append (LPS xs) (LPS []) = LPS xs
+append (LPS xs) (LPS ys) = LPS (xs ++ ys)
+{-# INLINE append #-}
+
+-- ---------------------------------------------------------------------
+-- Transformations
+
+-- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each
+-- element of @xs@.
+map :: (Word8 -> Word8) -> ByteString -> ByteString
+--map f (LPS xs) = LPS (L.map (P.map' f) xs)
+map f = LPS . P.loopArr . loopL (P.mapEFL f) P.NoAcc . unLPS
+{-# INLINE map #-}
+
+-- | /O(n)/ 'reverse' @xs@ efficiently returns the elements of @xs@ in reverse order.
+reverse :: ByteString -> ByteString
+reverse (LPS xs) = LPS (L.reverse . L.map P.reverse $ xs)
+{-# INLINE reverse #-}
+
+-- The 'intersperse' function takes a 'Word8' and a 'ByteString' and
+-- \`intersperses\' that byte between the elements of the 'ByteString'.
+-- It is analogous to the intersperse function on Lists.
+-- intersperse :: Word8 -> ByteString -> ByteString
+-- intersperse = error "FIXME: not yet implemented"
+
+{-
+intersperse c (LPS [])     = LPS []
+intersperse c (LPS (x:xs)) = LPS (P.intersperse c x : L.map intersperse')
+  where intersperse' c ps@(PS x s l) =
+          P.create (2*l) $ \p -> withForeignPtr x $ \f ->
+                poke p c
+                c_intersperse (p `plusPtr` 1) (f `plusPtr` s) l c
+-}
+
+-- | The 'transpose' function transposes the rows and columns of its
+-- 'ByteString' argument.
+transpose :: [ByteString] -> [ByteString]
+transpose s = L.map (\ss -> LPS [P.pack ss]) (L.transpose (L.map unpack s))
+
+-- ---------------------------------------------------------------------
+-- Reducing 'ByteString's
+
+-- | 'foldl', applied to a binary operator, a starting value (typically
+-- the left-identity of the operator), and a ByteString, reduces the
+-- ByteString using the binary operator, from left to right.
+foldl :: (a -> Word8 -> a) -> a -> ByteString -> a
+--foldl f z (LPS xs) = L.foldl (P.foldl f) z xs
+foldl f z = P.loopAcc . loopL (P.foldEFL f) z . unLPS
+{-# INLINE foldl #-}
+
+-- | 'foldl\'' is like 'foldl', but strict in the accumulator.
+foldl' :: (a -> Word8 -> a) -> a -> ByteString -> a
+--foldl' f z (LPS xs) = L.foldl' (P.foldl' f) z xs
+foldl' f z = P.loopAcc . loopL (P.foldEFL' f) z . unLPS
+{-# INLINE foldl' #-}
+
+-- | 'foldr', applied to a binary operator, a starting value
+-- (typically the right-identity of the operator), and a ByteString,
+-- reduces the ByteString using the binary operator, from right to left.
+foldr :: (Word8 -> a -> a) -> a -> ByteString -> a
+foldr k z (LPS xs) = L.foldr (flip (P.foldr k)) z xs
+{-# INLINE foldr #-}
+
+-- | 'foldl1' is a variant of 'foldl' that has no starting value
+-- argument, and thus must be applied to non-empty 'ByteStrings'.
+-- This function is subject to array fusion.
+foldl1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
+foldl1 _ (LPS []) = errorEmptyList "foldl1"
+foldl1 f (LPS (x:xs)) = foldl f (P.unsafeHead x) (LPS (P.unsafeTail x : xs))
+
+-- | 'foldl1\'' is like 'foldl1', but strict in the accumulator.
+foldl1' :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
+foldl1' _ (LPS []) = errorEmptyList "foldl1'"
+foldl1' f (LPS (x:xs)) = foldl' f (P.unsafeHead x) (LPS (P.unsafeTail x : xs))
+
+-- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
+-- and thus must be applied to non-empty 'ByteString's
+foldr1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
+foldr1 _ (LPS []) = errorEmptyList "foldr1"
+foldr1 f (LPS ps) = foldr1' ps
+  where foldr1' (x:[]) = P.foldr1 f x
+        foldr1' (x:xs) = P.foldr  f (foldr1' xs) x
+
+-- ---------------------------------------------------------------------
+-- Special folds
+
+-- | /O(n)/ Concatenate a list of ByteStrings.
+concat :: [ByteString] -> ByteString
+concat lpss = LPS (L.concatMap (\(LPS xs) -> xs) lpss)
+
+-- | Map a function over a 'ByteString' and concatenate the results
+concatMap :: (Word8 -> ByteString) -> ByteString -> ByteString
+concatMap f (LPS lps) = LPS (filterMap (P.concatMap k) lps)
+    where
+      k w = case f w of LPS xs -> P.concat xs
+
+-- | /O(n)/ Applied to a predicate and a ByteString, 'any' determines if
+-- any element of the 'ByteString' satisfies the predicate.
+any :: (Word8 -> Bool) -> ByteString -> Bool
+any f (LPS xs) = L.or (L.map (P.any f) xs)
+-- todo fuse
+
+-- | /O(n)/ Applied to a predicate and a 'ByteString', 'all' determines
+-- if all elements of the 'ByteString' satisfy the predicate.
+all :: (Word8 -> Bool) -> ByteString -> Bool
+all f (LPS xs) = L.and (L.map (P.all f) xs)
+-- todo fuse
+
+-- | /O(n)/ 'maximum' returns the maximum value from a 'ByteString'
+maximum :: ByteString -> Word8
+maximum (LPS []) = errorEmptyList "maximum"
+maximum (LPS xs) = L.maximum (L.map P.maximum xs)
+{-# INLINE maximum #-}
+
+-- | /O(n)/ 'minimum' returns the minimum value from a 'ByteString'
+minimum :: ByteString -> Word8
+minimum (LPS []) = errorEmptyList "minimum"
+minimum (LPS xs) = L.minimum (L.map P.minimum xs)
+{-# INLINE minimum #-}
+
+-- | The 'mapAccumL' function behaves like a combination of 'map' and
+-- 'foldl'; it applies a function to each element of a ByteString,
+-- passing an accumulating parameter from left to right, and returning a
+-- final value of this accumulator together with the new ByteString.
+mapAccumL :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString)
+mapAccumL f z = (\(a :*: ps) -> (a, LPS ps)) . loopL (P.mapAccumEFL f) z . unLPS
+
+-- | /O(n)/ map Word8 functions, provided with the index at each position
+mapIndexed :: (Int -> Word8 -> Word8) -> ByteString -> ByteString
+mapIndexed f = LPS . P.loopArr . loopL (P.mapIndexEFL f) 0 . unLPS
+
+-- ---------------------------------------------------------------------
+-- Building ByteStrings
+
+-- | 'scanl' is similar to 'foldl', but returns a list of successive
+-- reduced values from the left. This function will fuse.
+--
+-- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
+--
+-- Note that
+--
+-- > last (scanl f z xs) == foldl f z xs.
+scanl :: (Word8 -> Word8 -> Word8) -> Word8 -> ByteString -> ByteString
+scanl f z ps = LPS . P.loopArr . loopL (P.scanEFL f) z . unLPS $ (ps `snoc` 0)
+{-# INLINE scanl #-}
+
+-- ---------------------------------------------------------------------
+-- Unfolds and replicates
+
+-- | @'iterate' f x@ returns an infinite ByteString of repeated applications
+-- of @f@ to @x@:
+--
+-- > iterate f x == [x, f x, f (f x), ...]
+--
+iterate :: (Word8 -> Word8) -> Word8 -> ByteString
+iterate f = unfoldr (\x -> case f x of x' -> x' `seq` Just (x', x'))
+
+-- | @'repeat' x@ is an infinite ByteString, with @x@ the value of every
+-- element.
+--
+repeat :: Word8 -> ByteString
+repeat c = LPS (L.repeat block)
+    where block =  P.replicate smallChunkSize c
+
+-- | /O(n)/ @'replicate' n x@ is a ByteString of length @n@ with @x@
+-- the value of every element.
+--
+replicate :: Int64 -> Word8 -> ByteString
+replicate w c
+    | w <= 0             = empty
+    | w < fromIntegral smallChunkSize = LPS [P.replicate (fromIntegral w) c]
+    | r == 0             = LPS (L.genericReplicate q s) -- preserve invariant
+    | otherwise          = LPS (P.unsafeTake (fromIntegral r) s : L.genericReplicate q s)
+ where
+    s      = P.replicate smallChunkSize c
+    (q, r) = quotRem w (fromIntegral smallChunkSize)
+
+-- | 'cycle' ties a finite ByteString into a circular one, or equivalently,
+-- the infinite repetition of the original ByteString.
+--
+cycle :: ByteString -> ByteString
+cycle (LPS []) = errorEmptyList "cycle"
+cycle (LPS xs) = LPS (L.cycle xs)
+
+-- | /O(n)/ The 'unfoldr' function is analogous to the List \'unfoldr\'.
+-- 'unfoldr' builds a ByteString from a seed value.  The function takes
+-- the element and returns 'Nothing' if it is done producing the
+-- ByteString or returns 'Just' @(a,b)@, in which case, @a@ is a
+-- prepending to the ByteString and @b@ is used as the next element in a
+-- recursive call.
+unfoldr :: (a -> Maybe (Word8, a)) -> a -> ByteString
+unfoldr f = LPS . unfoldChunk 32
+  where unfoldChunk n x =
+          case P.unfoldrN n f x of
+            (s, Nothing)
+              | P.null s  -> []
+              | otherwise -> s : []
+            (s, Just x')  -> s : unfoldChunk ((n*2) `min` smallChunkSize) x'
+
+-- ---------------------------------------------------------------------
+-- Substrings
+
+-- | /O(n\/c)/ 'take' @n@, applied to a ByteString @xs@, returns the prefix
+-- of @xs@ of length @n@, or @xs@ itself if @n > 'length' xs@.
+take :: Int64 -> ByteString -> ByteString
+take n _ | n < 0 = empty
+take i (LPS ps)  = LPS (take' i ps)
+  where take' _ []     = []
+        take' 0 _      = []
+        take' n (x:xs) =
+          if n < fromIntegral (P.length x)
+            then P.take (fromIntegral n) x : []
+            else x : take' (n - fromIntegral (P.length x)) xs
+
+-- | /O(n\/c)/ 'drop' @n xs@ returns the suffix of @xs@ after the first @n@
+-- elements, or @[]@ if @n > 'length' xs@.
+drop  :: Int64 -> ByteString -> ByteString
+drop i p | i <= 0 = p
+drop i (LPS ps) = LPS (drop' i ps)
+  where drop' _ []     = []
+        drop' 0 xs     = xs
+        drop' n (x:xs) =
+          if n < fromIntegral (P.length x)
+            then P.drop (fromIntegral n) x : xs
+            else drop' (n - fromIntegral (P.length x)) xs
+
+-- | /O(n\/c)/ 'splitAt' @n xs@ is equivalent to @('take' n xs, 'drop' n xs)@.
+splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
+splitAt i p        | i <= 0 = (empty, p)
+splitAt i (LPS ps) = case splitAt' i ps of (a,b) -> (LPS a, LPS b)
+  where splitAt' _ []     = ([], [])
+        splitAt' 0 xs     = ([], xs)
+        splitAt' n (x:xs) =
+          if n < fromIntegral (P.length x)
+            then (P.take (fromIntegral n) x : [], 
+                  P.drop (fromIntegral n) x : xs)
+            else let (xs', xs'') = splitAt' (n - fromIntegral (P.length x)) xs
+                   in (x:xs', xs'')
+
+
+-- | 'takeWhile', applied to a predicate @p@ and a ByteString @xs@,
+-- returns the longest prefix (possibly empty) of @xs@ of elements that
+-- satisfy @p@.
+takeWhile :: (Word8 -> Bool) -> ByteString -> ByteString
+takeWhile f (LPS ps) = LPS (takeWhile' ps)
+  where takeWhile' []     = []
+        takeWhile' (x:xs) =
+          case P.findIndexOrEnd (not . f) x of
+            0                  -> []
+            n | n < P.length x -> P.take n x : []
+              | otherwise      -> x : takeWhile' xs
+
+-- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
+dropWhile :: (Word8 -> Bool) -> ByteString -> ByteString
+dropWhile f (LPS ps) = LPS (dropWhile' ps)
+  where dropWhile' []     = []
+        dropWhile' (x:xs) =
+          case P.findIndexOrEnd (not . f) x of
+            n | n < P.length x -> P.drop n x : xs
+              | otherwise      -> dropWhile' xs
+
+-- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
+break :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+break f (LPS ps) = case (break' ps) of (a,b) -> (LPS a, LPS b)
+  where break' []     = ([], [])
+        break' (x:xs) =
+          case P.findIndexOrEnd f x of
+            0                  -> ([], x : xs)
+            n | n < P.length x -> (P.take n x : [], P.drop n x : xs)
+              | otherwise      -> let (xs', xs'') = break' xs
+                                   in (x : xs', xs'')
+
+-- | 'breakByte' breaks its ByteString argument at the first occurence
+-- of the specified byte. It is more efficient than 'break' as it is
+-- implemented with @memchr(3)@. I.e.
+-- 
+-- > break (=='c') "abcd" == breakByte 'c' "abcd"
+--
+breakByte :: Word8 -> ByteString -> (ByteString, ByteString)
+breakByte c (LPS ps) = case (breakByte' ps) of (a,b) -> (LPS a, LPS b)
+  where breakByte' []     = ([], [])
+        breakByte' (x:xs) =
+          case P.elemIndex c x of
+            Just 0  -> ([], x : xs)
+            Just n  -> (P.take n x : [], P.drop n x : xs)
+            Nothing -> let (xs', xs'') = breakByte' xs
+                        in (x : xs', xs'')
+
+-- | 'spanByte' breaks its ByteString argument at the first
+-- occurence of a byte other than its argument. It is more efficient
+-- than 'span (==)'
+--
+-- > span  (=='c') "abcd" == spanByte 'c' "abcd"
+--
+spanByte :: Word8 -> ByteString -> (ByteString, ByteString)
+spanByte c (LPS ps) = case (spanByte' ps) of (a,b) -> (LPS a, LPS b)
+  where spanByte' []     = ([], [])
+        spanByte' (x:xs) =
+          case P.spanByte c x of
+            (x', x'') | P.null x'  -> ([], x : xs)
+                      | P.null x'' -> let (xs', xs'') = spanByte' xs
+                                       in (x : xs', xs'')
+                      | otherwise  -> (x' : [], x'' : xs)
+
+-- | 'span' @p xs@ breaks the ByteString into two segments. It is
+-- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
+span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
+span p = break (not . p)
+
+-- | /O(n)/ Splits a 'ByteString' into components delimited by
+-- separators, where the predicate returns True for a separator element.
+-- The resulting components do not contain the separators.  Two adjacent
+-- separators result in an empty component in the output.  eg.
+--
+-- > splitWith (=='a') "aabbaca" == ["","","bb","c",""]
+-- > splitWith (=='a') []        == []
+--
+splitWith :: (Word8 -> Bool) -> ByteString -> [ByteString]
+splitWith _ (LPS [])     = []
+splitWith p (LPS (a:as)) = comb [] (P.splitWith p a) as
+
+  where comb :: [P.ByteString] -> [P.ByteString] -> [P.ByteString] -> [ByteString]
+        comb acc (s:[]) []     = LPS (L.reverse (cons' s acc)) : []
+        comb acc (s:[]) (x:xs) = comb (cons' s acc) (P.splitWith p x) xs
+        comb acc (s:ss) xs     = LPS (L.reverse (cons' s acc)) : comb [] ss xs
+
+        cons' x xs | P.null x  = xs
+                   | otherwise = x:xs
+        {-# INLINE cons' #-}
+{-# INLINE splitWith #-}
+
+-- | /O(n)/ Break a 'ByteString' into pieces separated by the byte
+-- argument, consuming the delimiter. I.e.
+--
+-- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
+-- > split 'a'  "aXaXaXa"    == ["","X","X","X"]
+-- > split 'x'  "x"          == ["",""]
+-- 
+-- and
+--
+-- > join [c] . split c == id
+-- > split == splitWith . (==)
+-- 
+-- As for all splitting functions in this library, this function does
+-- not copy the substrings, it just constructs new 'ByteStrings' that
+-- are slices of the original.
+--
+split :: Word8 -> ByteString -> [ByteString]
+split _ (LPS [])     = []
+split c (LPS (a:as)) = comb [] (P.split c a) as
+
+  where comb :: [P.ByteString] -> [P.ByteString] -> [P.ByteString] -> [ByteString]
+        comb acc (s:[]) []     = LPS (L.reverse (cons' s acc)) : []
+        comb acc (s:[]) (x:xs) = comb (cons' s acc) (P.split c x) xs
+        comb acc (s:ss) xs     = LPS (L.reverse (cons' s acc)) : comb [] ss xs
+
+        cons' x xs | P.null x  = xs
+                   | otherwise = x:xs
+        {-# INLINE cons' #-}
+{-# INLINE split #-}
+
+-- | Like 'splitWith', except that sequences of adjacent separators are
+-- treated as a single separator. eg.
+-- 
+-- > tokens (=='a') "aabbaca" == ["bb","c"]
+--
+tokens :: (Word8 -> Bool) -> ByteString -> [ByteString]
+tokens f = L.filter (not.null) . splitWith f
+
+-- | The 'group' function takes a ByteString and returns a list of
+-- ByteStrings such that the concatenation of the result is equal to the
+-- argument.  Moreover, each sublist in the result contains only equal
+-- elements.  For example,
+--
+-- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
+--
+-- It is a special case of 'groupBy', which allows the programmer to
+-- supply their own equality test.
+group :: ByteString -> [ByteString]
+group (LPS [])     = []
+group (LPS (a:as)) = group' [] (P.group a) as
+  where group' :: [P.ByteString] -> [P.ByteString] -> [P.ByteString] -> [ByteString]
+        group' acc@(s':_) ss@(s:_) xs
+          | P.unsafeHead s'
+         /= P.unsafeHead s       = LPS (L.reverse acc) : group' [] ss xs
+        group' acc (s:[]) []     = LPS (L.reverse (s : acc)) : []
+        group' acc (s:[]) (x:xs) = group' (s:acc) (P.group x) xs
+        group' acc (s:ss) xs     = LPS (L.reverse (s : acc)) : group' [] ss xs
+
+{-
+TODO: check if something like this might be faster
+
+group :: ByteString -> [ByteString]
+group xs
+    | null xs   = []
+    | otherwise = ys : group zs
+    where
+        (ys, zs) = spanByte (unsafeHead xs) xs
+-}
+
+-- | The 'groupBy' function is the non-overloaded version of 'group'.
+--
+groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+groupBy _ (LPS [])     = []
+groupBy k (LPS (a:as)) = groupBy' [] 0 (P.groupBy k a) as
+  where groupBy' :: [P.ByteString] -> Word8 -> [P.ByteString] -> [P.ByteString] -> [ByteString]
+        groupBy' acc@(_:_) c ss@(s:_) xs
+          | not (c `k` P.unsafeHead s) = LPS (L.reverse acc) : groupBy' [] 0 ss xs
+        groupBy' acc _ (s:[]) []       = LPS (L.reverse (s : acc)) : []
+        groupBy' []  _ (s:[]) (x:xs)   = groupBy' (s:[]) (P.unsafeHead s) (P.groupBy k x) xs
+        groupBy' acc c (s:[]) (x:xs)   = groupBy' (s:acc) c (P.groupBy k x) xs
+        groupBy' acc _ (s:ss) xs       = LPS (L.reverse (s : acc)) : groupBy' [] 0 ss xs
+
+{-
+TODO: check if something like this might be faster
+
+groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+groupBy k xs
+    | null xs   = []
+    | otherwise = take n xs : groupBy k (drop n xs)
+    where
+        n = 1 + findIndexOrEnd (not . k (head xs)) (tail xs)
+-}
+
+-- | /O(n)/ The 'join' function takes a 'ByteString' and a list of
+-- 'ByteString's and concatenates the list after interspersing the first
+-- argument between each element of the list.
+join :: ByteString -> [ByteString] -> ByteString
+join s = concat . (L.intersperse s)
+
+-- | /O(n)/ joinWithByte. An efficient way to join to two ByteStrings
+-- with a char.
+--
+joinWithByte :: Word8 -> ByteString -> ByteString -> ByteString
+joinWithByte c x y = append x (cons c y)
+
+-- ---------------------------------------------------------------------
+-- Indexing ByteStrings
+
+-- | /O(c)/ 'ByteString' index (subscript) operator, starting from 0.
+index :: ByteString -> Int64 -> Word8
+index _        i | i < 0 = moduleError "index" ("negative index: " ++ show i)
+index (LPS ps) i         = index' ps i
+  where index' []     n = moduleError "index" ("index too large: " ++ show n)
+        index' (x:xs) n
+          | n >= fromIntegral (P.length x) = 
+              index' xs (n - fromIntegral (P.length x))
+          | otherwise       = P.unsafeIndex x (fromIntegral n)
+
+-- | /O(n)/ The 'elemIndex' function returns the index of the first
+-- element in the given 'ByteString' which is equal to the query
+-- element, or 'Nothing' if there is no such element. 
+-- This implementation uses memchr(3).
+elemIndex :: Word8 -> ByteString -> Maybe Int64
+elemIndex c (LPS ps) = elemIndex' 0 ps
+  where elemIndex' _ []     = Nothing
+        elemIndex' n (x:xs) =
+          case P.elemIndex c x of
+            Nothing -> elemIndex' (n + fromIntegral (P.length x)) xs
+            Just i  -> Just (n + fromIntegral i)
+
+{-
+-- | /O(n)/ The 'elemIndexEnd' function returns the last index of the
+-- element in the given 'ByteString' which is equal to the query
+-- element, or 'Nothing' if there is no such element. The following
+-- holds:
+--
+-- > elemIndexEnd c xs == 
+-- > (-) (length xs - 1) `fmap` elemIndex c (reverse xs)
+--
+elemIndexEnd :: Word8 -> ByteString -> Maybe Int
+elemIndexEnd ch (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p ->
+    go (p `plusPtr` s) (l-1)
+  where
+    STRICT2(go)
+    go p i | i < 0     = return Nothing
+           | otherwise = do ch' <- peekByteOff p i
+                            if ch == ch'
+                                then return $ Just i
+                                else go p (i-1)
+-}
+-- | /O(n)/ The 'elemIndices' function extends 'elemIndex', by returning
+-- the indices of all elements equal to the query element, in ascending order.
+-- This implementation uses memchr(3).
+elemIndices :: Word8 -> ByteString -> [Int64]
+elemIndices c (LPS ps) = elemIndices' 0 ps
+  where elemIndices' _ []     = []
+        elemIndices' n (x:xs) = L.map ((+n).fromIntegral) (P.elemIndices c x)
+                             ++ elemIndices' (n + fromIntegral (P.length x)) xs
+
+-- | count returns the number of times its argument appears in the ByteString
+--
+-- > count = length . elemIndices
+--
+-- But more efficiently than using length on the intermediate list.
+count :: Word8 -> ByteString -> Int64
+count w (LPS xs) = L.sum (L.map (fromIntegral . P.count w) xs)
+
+-- | The 'findIndex' function takes a predicate and a 'ByteString' and
+-- returns the index of the first element in the ByteString
+-- satisfying the predicate.
+findIndex :: (Word8 -> Bool) -> ByteString -> Maybe Int64
+findIndex k (LPS ps) = findIndex' 0 ps
+  where findIndex' _ []     = Nothing
+        findIndex' n (x:xs) =
+          case P.findIndex k x of
+            Nothing -> findIndex' (n + fromIntegral (P.length x)) xs
+            Just i  -> Just (n + fromIntegral i)
+{-# INLINE findIndex #-}
+
+-- | /O(n)/ The 'find' function takes a predicate and a ByteString,
+-- and returns the first element in matching the predicate, or 'Nothing'
+-- if there is no such element.
+--
+-- > find f p = case findIndex f p of Just n -> Just (p ! n) ; _ -> Nothing
+--
+find :: (Word8 -> Bool) -> ByteString -> Maybe Word8
+find f (LPS ps) = find' ps
+  where find' []     = Nothing
+        find' (x:xs) = case P.find f x of
+            Nothing -> find' xs
+            Just w  -> Just w
+{-# INLINE find #-}
+
+-- | The 'findIndices' function extends 'findIndex', by returning the
+-- indices of all elements satisfying the predicate, in ascending order.
+findIndices :: (Word8 -> Bool) -> ByteString -> [Int64]
+findIndices k (LPS ps) = findIndices' 0 ps
+  where findIndices' _ []     = []
+        findIndices' n (x:xs) = L.map ((+n).fromIntegral) (P.findIndices k x)
+                             ++ findIndices' (n + fromIntegral (P.length x)) xs
+
+-- ---------------------------------------------------------------------
+-- Searching ByteStrings
+
+-- | /O(n)/ 'elem' is the 'ByteString' membership predicate.
+elem :: Word8 -> ByteString -> Bool
+elem c ps = case elemIndex c ps of Nothing -> False ; _ -> True
+
+-- | /O(n)/ 'notElem' is the inverse of 'elem'
+notElem :: Word8 -> ByteString -> Bool
+notElem c ps = not (elem c ps)
+
+-- | /O(n)/ 'filter', applied to a predicate and a ByteString,
+-- returns a ByteString containing those characters that satisfy the
+-- predicate.
+filter :: (Word8 -> Bool) -> ByteString -> ByteString
+--filter f (LPS xs) = LPS (filterMap (P.filter' f) xs)
+filter p = LPS . P.loopArr . loopL (P.filterEFL p) P.NoAcc . unLPS
+{-# INLINE filter #-}
+
+-- | /O(n)/ and /O(n\/c) space/ A first order equivalent of /filter .
+-- (==)/, for the common case of filtering a single byte. It is more
+-- efficient to use /filterByte/ in this case.
+--
+-- > filterByte == filter . (==)
+--
+-- filterByte is around 10x faster, and uses much less space, than its
+-- filter equivalent
+filterByte :: Word8 -> ByteString -> ByteString
+filterByte w ps = replicate (count w ps) w
+-- filterByte w (LPS xs) = LPS (filterMap (P.filterByte w) xs)
+
+-- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
+-- case of filtering a single byte out of a list. It is more efficient
+-- to use /filterNotByte/ in this case.
+--
+-- > filterNotByte == filter . (/=)
+--
+-- filterNotByte is around 2x faster than its filter equivalent.
+filterNotByte :: Word8 -> ByteString -> ByteString
+filterNotByte w (LPS xs) = LPS (filterMap (P.filterNotByte w) xs)
+
+-- ---------------------------------------------------------------------
+-- Searching for substrings
+
+-- | /O(n)/ The 'isPrefixOf' function takes two ByteStrings and returns 'True'
+-- iff the first is a prefix of the second.
+isPrefixOf :: ByteString -> ByteString -> Bool
+isPrefixOf (LPS as) (LPS bs) = isPrefixL as bs
+  where isPrefixL [] _  = True
+        isPrefixL _ []  = False
+        isPrefixL (x:xs) (y:ys) | P.length x == P.length y = x == y  && isPrefixL xs ys
+                                | P.length x <  P.length y = x == yh && isPrefixL xs (yt:ys)
+                                | otherwise                = xh == y && isPrefixL (xt:xs) ys
+          where (xh,xt) = P.splitAt (P.length y) x
+                (yh,yt) = P.splitAt (P.length x) y
+
+-- | /O(n)/ The 'isSuffixOf' function takes two ByteStrings and returns 'True'
+-- iff the first is a suffix of the second.
+-- 
+-- The following holds:
+--
+-- > isSuffixOf x y == reverse x `isPrefixOf` reverse y
+--
+-- However, the real implemenation uses memcmp to compare the end of the
+-- string only, with no reverse required..
+--
+--isSuffixOf :: ByteString -> ByteString -> Bool
+--isSuffixOf = error "not yet implemented"
+
+-- ---------------------------------------------------------------------
+-- Zipping
+
+-- | /O(n)/ 'zip' takes two ByteStrings and returns a list of
+-- corresponding pairs of bytes. If one input ByteString is short,
+-- excess elements of the longer ByteString are discarded. This is
+-- equivalent to a pair of 'unpack' operations.
+zip :: ByteString -> ByteString -> [(Word8,Word8)]
+zip = zipWith (,)
+
+-- | 'zipWith' generalises 'zip' by zipping with the function given as
+-- the first argument, instead of a tupling function.  For example,
+-- @'zipWith' (+)@ is applied to two ByteStrings to produce the list of
+-- corresponding sums.
+zipWith :: (Word8 -> Word8 -> a) -> ByteString -> ByteString -> [a]
+zipWith _ (LPS [])     (LPS _)  = []
+zipWith _ (LPS _)      (LPS []) = []
+zipWith f (LPS (a:as)) (LPS (b:bs)) = zipWith' a as b bs
+  where zipWith' x xs y ys =
+          (f (P.unsafeHead x) (P.unsafeHead y) : zipWith'' (P.unsafeTail x) xs (P.unsafeTail y) ys)
+
+        zipWith'' x []      _ _       | P.null x       = []
+        zipWith'' _ _       y []      | P.null y       = []
+        zipWith'' x xs      y ys      | not (P.null x)
+                                     && not (P.null y) = zipWith' x  xs y  ys
+        zipWith'' x xs      _ (y':ys) | not (P.null x) = zipWith' x  xs y' ys
+        zipWith'' _ (x':xs) y ys      | not (P.null y) = zipWith' x' xs y  ys
+        zipWith'' _ (x':xs) _ (y':ys)                  = zipWith' x' xs y' ys
+
+-- | /O(n)/ 'unzip' transforms a list of pairs of bytes into a pair of
+-- ByteStrings. Note that this performs two 'pack' operations.
+{-
+unzip :: [(Word8,Word8)] -> (ByteString,ByteString)
+unzip _ls = error "not yet implemented"
+{-# INLINE unzip #-}
+-}
+
+-- ---------------------------------------------------------------------
+-- Special lists
+
+-- | /O(n)/ Return all initial segments of the given 'ByteString', shortest first.
+inits :: ByteString -> [ByteString]
+inits = (LPS [] :) . inits' . unLPS
+  where inits' []     = []
+        inits' (x:xs) = L.map (\x' -> LPS [x']) (L.tail (P.inits x))
+                     ++ L.map (\(LPS xs') -> LPS (x:xs')) (inits' xs)
+
+-- | /O(n)/ Return all final segments of the given 'ByteString', longest first.
+tails :: ByteString -> [ByteString]
+tails = tails' . unLPS
+  where tails' []           = LPS [] : []
+        tails' xs@(x:xs')
+          | P.length x == 1 = LPS xs : tails' xs'
+          | otherwise       = LPS xs : tails' (P.unsafeTail x : xs')
+
+-- ---------------------------------------------------------------------
+
+-- TODO defrag func that concatenates block together that are below a threshold
+-- defrag :: Int -> ByteString -> ByteString
+
+-- ---------------------------------------------------------------------
+-- Lazy ByteString IO
+
+-- | Read entire handle contents /lazily/ into a 'ByteString'. Chunks
+-- are read on demand, in @k@-sized chunks.
+hGetContentsN :: Int -> Handle -> IO ByteString
+hGetContentsN k h = lazyRead >>= return . LPS
+  where
+    lazyRead = unsafeInterleaveIO $ do
+        ps <- P.hGet h k
+        case P.length ps of
+            0         -> return []
+            n | n < k -> return [ps]
+            _         -> do pss <- lazyRead
+                            return (ps : pss)
+
+-- | Read @n@ bytes into a 'ByteString', directly from the
+-- specified 'Handle', in chunks of size @k@.
+hGetN :: Int -> Handle -> Int -> IO ByteString
+hGetN _ _ 0 = return empty
+hGetN k h n = readChunks n >>= return . LPS
+  where
+    STRICT1(readChunks)
+    readChunks i = do
+        ps <- P.hGet h (min k i)
+        case P.length ps of
+            0          -> return []
+            m | m == i -> return [ps]
+            m          -> do pss <- readChunks (i - m)
+                             return (ps : pss)
+
+#if defined(__GLASGOW_HASKELL__)
+-- | hGetNonBlockingN is similar to 'hGetContentsN', except that it will never block
+-- waiting for data to become available, instead it returns only whatever data
+-- is available. Chunks are read on demand, in @k@-sized chunks.
+hGetNonBlockingN :: Int -> Handle -> Int -> IO ByteString
+hGetNonBlockingN _ _ 0 = return empty
+hGetNonBlockingN k h n = readChunks n >>= return . LPS
+  where
+    readChunks i = do
+        ps <- P.hGetNonBlocking h (min k i)
+        case P.length ps of
+            0         -> return []
+            m | fromIntegral m < i -> return [ps]
+            m         -> do pss <- readChunks (i - m)
+                            return (ps : pss)
+#endif
+
+-- | Read entire handle contents /lazily/ into a 'ByteString'. Chunks
+-- are read on demand, using the default chunk size.
+hGetContents :: Handle -> IO ByteString
+hGetContents = hGetContentsN defaultChunkSize
+
+-- | Read @n@ bytes into a 'ByteString', directly from the specified 'Handle'.
+hGet :: Handle -> Int -> IO ByteString
+hGet = hGetN defaultChunkSize
+
+#if defined(__GLASGOW_HASKELL__)
+-- | hGetNonBlocking is similar to 'hGet', except that it will never block
+-- waiting for data to become available, instead it returns only whatever data
+-- is available.
+hGetNonBlocking :: Handle -> Int -> IO ByteString
+hGetNonBlocking = hGetNonBlockingN defaultChunkSize
+#endif
+
+
+-- | Read an entire file /lazily/ into a 'ByteString'.
+readFile :: FilePath -> IO ByteString
+readFile f = openBinaryFile f ReadMode >>= hGetContents
+
+-- | Write a 'ByteString' to a file.
+writeFile :: FilePath -> ByteString -> IO ()
+writeFile f txt = bracket (openBinaryFile f WriteMode) hClose
+    (\hdl -> hPut hdl txt)
+
+-- | Append a 'ByteString' to a file.
+appendFile :: FilePath -> ByteString -> IO ()
+appendFile f txt = bracket (openBinaryFile f AppendMode) hClose
+    (\hdl -> hPut hdl txt)
+
+-- | getContents. Equivalent to hGetContents stdin. Will read /lazily/
+getContents :: IO ByteString
+getContents = hGetContents stdin
+
+-- | Outputs a 'ByteString' to the specified 'Handle'.
+hPut :: Handle -> ByteString -> IO ()
+hPut h (LPS xs) = mapM_ (P.hPut h) xs
+
+-- | Write a ByteString to stdout
+putStr :: ByteString -> IO ()
+putStr = hPut stdout
+
+-- | Write a ByteString to stdout, appending a newline byte
+putStrLn :: ByteString -> IO ()
+putStrLn ps = hPut stdout ps >> hPut stdout (singleton 0x0a)
+
+-- | The interact function takes a function of type @ByteString -> ByteString@
+-- as its argument. The entire input from the standard input device is passed
+-- to this function as its argument, and the resulting string is output on the
+-- standard output device. It's great for writing one line programs!
+interact :: (ByteString -> ByteString) -> IO ()
+interact transformer = putStr . transformer =<< getContents
+
+-- ---------------------------------------------------------------------
+-- Internal utilities
+
+-- Common up near identical calls to `error' to reduce the number
+-- constant strings created when compiled:
+errorEmptyList :: String -> a
+errorEmptyList fun = moduleError fun "empty ByteString"
+
+moduleError :: String -> String -> a
+moduleError fun msg = error ("Data.ByteString.Lazy." ++ fun ++ ':':' ':msg)
+
+-- A manually fused version of "filter (not.null) . map f", since they
+-- don't seem to fuse themselves. Really helps out filter*, concatMap.
+--
+-- TODO fuse.
+--
+filterMap :: (P.ByteString -> P.ByteString) -> [P.ByteString] -> [P.ByteString]
+filterMap _ []     = []
+filterMap f (x:xs) = case f x of
+                    y | P.null y  ->     filterMap f xs      -- manually fuse the invariant filter
+                      | otherwise -> y : filterMap f xs
+{-# INLINE filterMap #-}
+
diff --git a/Data/ByteString/Lazy/Char8.hs b/Data/ByteString/Lazy/Char8.hs
new file mode 100644
index 0000000..15af132
--- /dev/null
+++ b/Data/ByteString/Lazy/Char8.hs
@@ -0,0 +1,693 @@
+{-# OPTIONS_GHC -cpp -optc-O1 -fno-warn-orphans #-}
+--
+-- -optc-O2 breaks with 4.0.4 gcc on debian
+--
+-- Module      : Data.ByteString.Lazy.Char8
+-- Copyright   : (c) Don Stewart 2006
+-- License     : BSD-style
+--
+-- Maintainer  : dons@cse.unsw.edu.au
+-- Stability   : experimental
+-- Portability : portable (tested with GHC>=6.4.1 and Hugs 2005)
+-- 
+
+--
+-- | Manipulate /lazy/ 'ByteString's using 'Char' operations. All Chars will
+-- be truncated to 8 bits. It can be expected that these functions will
+-- run at identical speeds to their Word8 equivalents in
+-- "Data.ByteString.Lazy".
+--
+-- This module is intended to be imported @qualified@, to avoid name
+-- clashes with "Prelude" functions.  eg.
+--
+-- > import qualified Data.ByteString.Lazy.Char8 as C
+--
+
+module Data.ByteString.Lazy.Char8 (
+
+        -- * The @ByteString@ type
+        ByteString(..),         -- instances: Eq, Ord, Show, Read, Data, Typeable
+
+        -- * Introducing and eliminating 'ByteString's
+        empty,                  -- :: ByteString
+        singleton,               -- :: Char   -> ByteString
+        pack,                   -- :: String -> ByteString
+        unpack,                 -- :: ByteString -> String
+
+        -- * Basic interface
+        cons,                   -- :: Char -> ByteString -> ByteString
+        snoc,                   -- :: ByteString -> Char -> ByteString
+        append,                 -- :: ByteString -> ByteString -> ByteString
+        head,                   -- :: ByteString -> Char
+        last,                   -- :: ByteString -> Char
+        tail,                   -- :: ByteString -> ByteString
+        init,                   -- :: ByteString -> ByteString
+        null,                   -- :: ByteString -> Bool
+        length,                 -- :: ByteString -> Int64
+
+        -- * Transformating ByteStrings
+        map,                    -- :: (Char -> Char) -> ByteString -> ByteString
+        reverse,                -- :: ByteString -> ByteString
+--      intersperse,            -- :: Char -> ByteString -> ByteString
+        transpose,              -- :: [ByteString] -> [ByteString]
+
+        -- * Reducing 'ByteString's (folds)
+        foldl,                  -- :: (a -> Char -> a) -> a -> ByteString -> a
+        foldl',                 -- :: (a -> Char -> a) -> a -> ByteString -> a
+        foldl1,                 -- :: (Char -> Char -> Char) -> ByteString -> Char
+        foldl1',                -- :: (Char -> Char -> Char) -> ByteString -> Char
+        foldr,                  -- :: (Char -> a -> a) -> a -> ByteString -> a
+        foldr1,                 -- :: (Char -> Char -> Char) -> ByteString -> Char
+
+        -- ** Special folds
+        concat,                 -- :: [ByteString] -> ByteString
+        concatMap,              -- :: (Char -> ByteString) -> ByteString -> ByteString
+        any,                    -- :: (Char -> Bool) -> ByteString -> Bool
+        all,                    -- :: (Char -> Bool) -> ByteString -> Bool
+        maximum,                -- :: ByteString -> Char
+        minimum,                -- :: ByteString -> Char
+
+        -- * Building ByteStrings
+        -- ** Scans
+        scanl,                  -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
+--      scanl1,                 -- :: (Char -> Char -> Char) -> ByteString -> ByteString
+--      scanr,                  -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
+--      scanr1,                 -- :: (Char -> Char -> Char) -> ByteString -> ByteString
+
+        -- ** Accumulating maps
+        mapAccumL,   -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString)
+        mapIndexed,  -- :: (Int64 -> Char -> Char) -> ByteString -> ByteString
+
+        -- ** Infinite ByteStrings
+        repeat,                 -- :: Char -> ByteString
+        replicate,              -- :: Int64 -> Char -> ByteString
+        cycle,                  -- :: ByteString -> ByteString
+        iterate,                -- :: (Char -> Char) -> Char -> ByteString
+
+        -- ** Unfolding
+        unfoldr,                -- :: (a -> Maybe (Char, a)) -> a -> ByteString
+
+        -- * Substrings
+
+        -- ** Breaking strings
+        take,                   -- :: Int64 -> ByteString -> ByteString
+        drop,                   -- :: Int64 -> ByteString -> ByteString
+        splitAt,                -- :: Int64 -> ByteString -> (ByteString, ByteString)
+        takeWhile,              -- :: (Char -> Bool) -> ByteString -> ByteString
+        dropWhile,              -- :: (Char -> Bool) -> ByteString -> ByteString
+        span,                   -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+        break,                  -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+        group,                  -- :: ByteString -> [ByteString]
+        groupBy,                -- :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
+        inits,                  -- :: ByteString -> [ByteString]
+        tails,                  -- :: ByteString -> [ByteString]
+
+        -- ** Breaking and dropping on specific Chars
+        breakChar,              -- :: Char -> ByteString -> (ByteString, ByteString)
+        spanChar,               -- :: Char -> ByteString -> (ByteString, ByteString)
+
+        -- ** Breaking into many substrings
+        split,                  -- :: Char -> ByteString -> [ByteString]
+        splitWith,              -- :: (Char -> Bool) -> ByteString -> [ByteString]
+        tokens,                 -- :: (Char -> Bool) -> ByteString -> [ByteString]
+
+        -- ** Breaking into lines and words
+        lines,                  -- :: ByteString -> [ByteString]
+        words,                  -- :: ByteString -> [ByteString]
+        unlines,                -- :: [ByteString] -> ByteString
+        unwords,                -- :: ByteString -> [ByteString]
+
+        -- ** Joining strings
+        join,                   -- :: ByteString -> [ByteString] -> ByteString
+        joinWithChar,           -- :: Char -> ByteString -> ByteString -> ByteString
+
+        -- * Predicates
+        isPrefixOf,             -- :: ByteString -> ByteString -> Bool
+--      isSuffixOf,             -- :: ByteString -> ByteString -> Bool
+
+        -- * Searching ByteStrings
+
+        -- ** Searching by equality
+        elem,                   -- :: Char -> ByteString -> Bool
+        notElem,                -- :: Char -> ByteString -> Bool
+        filterChar,             -- :: Char -> ByteString -> ByteString
+        filterNotChar,          -- :: Char -> ByteString -> ByteString
+
+        -- ** Searching with a predicate
+        find,                   -- :: (Char -> Bool) -> ByteString -> Maybe Char
+        filter,                 -- :: (Char -> Bool) -> ByteString -> ByteString
+--      partition               -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+
+        -- * Indexing ByteStrings
+        index,                  -- :: ByteString -> Int64 -> Char
+        elemIndex,              -- :: Char -> ByteString -> Maybe Int64
+        elemIndices,            -- :: Char -> ByteString -> [Int64]
+        findIndex,              -- :: (Char -> Bool) -> ByteString -> Maybe Int64
+        findIndices,            -- :: (Char -> Bool) -> ByteString -> [Int64]
+        count,                  -- :: Char -> ByteString -> Int64
+
+        -- * Zipping and unzipping ByteStrings
+        zip,                    -- :: ByteString -> ByteString -> [(Char,Char)]
+        zipWith,                -- :: (Char -> Char -> c) -> ByteString -> ByteString -> [c]
+--      unzip,                  -- :: [(Char,Char)] -> (ByteString,ByteString)
+
+        -- * Ordered ByteStrings
+--        sort,                   -- :: ByteString -> ByteString
+
+        -- * Reading from ByteStrings
+        readInt,
+
+        -- * I\/O with 'ByteString's
+
+        -- ** Standard input and output
+        getContents,            -- :: IO ByteString
+        putStr,                 -- :: ByteString -> IO ()
+        putStrLn,               -- :: ByteString -> IO ()
+        interact,               -- :: (ByteString -> ByteString) -> IO ()
+
+        -- ** Files
+        readFile,               -- :: FilePath -> IO ByteString
+        writeFile,              -- :: FilePath -> ByteString -> IO ()
+        appendFile,             -- :: FilePath -> ByteString -> IO ()
+
+        -- ** I\/O with Handles
+        hGetContents,           -- :: Handle -> IO ByteString
+        hGetContentsN,          -- :: Int -> Handle -> IO ByteString
+        hGet,                   -- :: Handle -> Int64 -> IO ByteString
+        hGetN,                  -- :: Int -> Handle -> Int64 -> IO ByteString
+        hPut,                   -- :: Handle -> ByteString -> IO ()
+#if defined(__GLASGOW_HASKELL__)
+        hGetNonBlocking,        -- :: Handle -> IO ByteString
+        hGetNonBlockingN,       -- :: Int -> Handle -> IO ByteString
+#endif
+  ) where
+
+-- Functions transparently exported
+import Data.ByteString.Lazy 
+        (ByteString(..)
+        ,empty,null,length,tail,init,append,reverse,transpose
+        ,concat,take,drop,splitAt,join,isPrefixOf,group,inits, tails
+        ,hGetContentsN, hGetN, hGetContents, hGet, hPut, getContents
+#if defined(__GLASGOW_HASKELL__)
+        ,hGetNonBlocking, hGetNonBlockingN
+#endif
+        ,putStr, putStrLn
+        ,readFile, writeFile, appendFile)
+
+-- Functions we need to wrap.
+import qualified Data.ByteString.Lazy as L
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Base as B
+import Data.ByteString.Base (w2c, c2w, isSpaceWord8)
+
+import Data.Int (Int64)
+import qualified Data.List as List (intersperse)
+
+import qualified Prelude as P
+import Prelude hiding           
+        (reverse,head,tail,last,init,null,length,map,lines,foldl,foldr,unlines
+        ,concat,any,take,drop,splitAt,takeWhile,dropWhile,span,break,elem,filter
+        ,unwords,words,maximum,minimum,all,concatMap,scanl,scanl1,foldl1,foldr1
+        ,readFile,writeFile,appendFile,replicate,getContents,getLine,putStr,putStrLn
+        ,zip,zipWith,unzip,notElem,repeat,iterate)
+
+#define STRICT1(f) f a | a `seq` False = undefined
+#define STRICT2(f) f a b | a `seq` b `seq` False = undefined
+#define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
+#define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
+#define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
+
+------------------------------------------------------------------------
+
+-- | /O(1)/ Convert a 'Char' into a 'ByteString'
+singleton :: Char -> ByteString
+singleton = L.singleton . c2w
+{-# INLINE singleton #-}
+
+-- | /O(n)/ Convert a 'String' into a 'ByteString'. 
+pack :: [Char] -> ByteString
+pack = L.packWith c2w
+
+-- | /O(n)/ Converts a 'ByteString' to a 'String'.
+unpack :: ByteString -> [Char]
+unpack = L.unpackWith w2c
+{-# INLINE unpack #-}
+
+-- | /O(n)/ 'cons' is analogous to (:) for lists, but of different
+-- complexity, as it requires a memcpy.
+cons :: Char -> ByteString -> ByteString
+cons = L.cons . c2w
+{-# INLINE cons #-}
+
+-- | /O(n)/ Append a Char to the end of a 'ByteString'. Similar to
+-- 'cons', this function performs a memcpy.
+snoc :: ByteString -> Char -> ByteString
+snoc p = L.snoc p . c2w
+{-# INLINE snoc #-}
+
+-- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
+head :: ByteString -> Char
+head = w2c . L.head
+{-# INLINE head #-}
+
+-- | /O(1)/ Extract the last element of a packed string, which must be non-empty.
+last :: ByteString -> Char
+last = w2c . L.last
+{-# INLINE last #-}
+
+-- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each element of @xs@
+map :: (Char -> Char) -> ByteString -> ByteString
+map f = L.map (c2w . f . w2c)
+{-# INLINE map #-}
+
+-- | 'foldl', applied to a binary operator, a starting value (typically
+-- the left-identity of the operator), and a ByteString, reduces the
+-- ByteString using the binary operator, from left to right.
+foldl :: (a -> Char -> a) -> a -> ByteString -> a
+foldl f = L.foldl (\a c -> f a (w2c c))
+{-# INLINE foldl #-}
+
+-- | 'foldl\'' is like foldl, but strict in the accumulator.
+foldl' :: (a -> Char -> a) -> a -> ByteString -> a
+foldl' f = L.foldl' (\a c -> f a (w2c c))
+{-# INLINE foldl' #-}
+
+-- | 'foldr', applied to a binary operator, a starting value
+-- (typically the right-identity of the operator), and a packed string,
+-- reduces the packed string using the binary operator, from right to left.
+foldr :: (Char -> a -> a) -> a -> ByteString -> a
+foldr f = L.foldr (\c a -> f (w2c c) a)
+{-# INLINE foldr #-}
+
+-- | 'foldl1' is a variant of 'foldl' that has no starting value
+-- argument, and thus must be applied to non-empty 'ByteStrings'.
+foldl1 :: (Char -> Char -> Char) -> ByteString -> Char
+foldl1 f ps = w2c (L.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
+{-# INLINE foldl1 #-}
+
+-- | 'foldl1\'' is like 'foldl1', but strict in the accumulator.
+foldl1' :: (Char -> Char -> Char) -> ByteString -> Char
+foldl1' f ps = w2c (L.foldl1' (\x y -> c2w (f (w2c x) (w2c y))) ps)
+
+-- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
+-- and thus must be applied to non-empty 'ByteString's
+foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
+foldr1 f ps = w2c (L.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
+{-# INLINE foldr1 #-}
+
+-- | Map a function over a 'ByteString' and concatenate the results
+concatMap :: (Char -> ByteString) -> ByteString -> ByteString
+concatMap f = L.concatMap (f . w2c)
+{-# INLINE concatMap #-}
+
+-- | Applied to a predicate and a ByteString, 'any' determines if
+-- any element of the 'ByteString' satisfies the predicate.
+any :: (Char -> Bool) -> ByteString -> Bool
+any f = L.any (f . w2c)
+{-# INLINE any #-}
+
+-- | Applied to a predicate and a 'ByteString', 'all' determines if
+-- all elements of the 'ByteString' satisfy the predicate.
+all :: (Char -> Bool) -> ByteString -> Bool
+all f = L.all (f . w2c)
+{-# INLINE all #-}
+
+-- | 'maximum' returns the maximum value from a 'ByteString'
+maximum :: ByteString -> Char
+maximum = w2c . L.maximum
+{-# INLINE maximum #-}
+
+-- | 'minimum' returns the minimum value from a 'ByteString'
+minimum :: ByteString -> Char
+minimum = w2c . L.minimum
+{-# INLINE minimum #-}
+
+-- ---------------------------------------------------------------------
+-- Building ByteStrings
+
+-- | 'scanl' is similar to 'foldl', but returns a list of successive
+-- reduced values from the left. This function will fuse.
+--
+-- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
+--
+-- Note that
+--
+-- > last (scanl f z xs) == foldl f z xs.
+scanl :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
+scanl f z = L.scanl (\a b -> c2w (f (w2c a) (w2c b))) (c2w z)
+
+-- | The 'mapAccumL' function behaves like a combination of 'map' and
+-- 'foldl'; it applies a function to each element of a ByteString,
+-- passing an accumulating parameter from left to right, and returning a
+-- final value of this accumulator together with the new ByteString.
+mapAccumL :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString)
+mapAccumL f = L.mapAccumL (\a w -> case f a (w2c w) of (a',c) -> (a', c2w c))
+
+-- | /O(n)/ map Char functions, provided with the index at each position
+mapIndexed :: (Int -> Char -> Char) -> ByteString -> ByteString
+mapIndexed f = L.mapIndexed (\i w -> c2w (f i (w2c w)))
+
+------------------------------------------------------------------------
+-- Generating and unfolding ByteStrings
+
+-- | @'iterate' f x@ returns an infinite ByteString of repeated applications
+-- of @f@ to @x@:
+--
+-- > iterate f x == [x, f x, f (f x), ...]
+--
+iterate :: (Char -> Char) -> Char -> ByteString
+iterate f = L.iterate (c2w . f . w2c) . c2w
+
+-- | @'repeat' x@ is an infinite ByteString, with @x@ the value of every
+-- element.
+--
+repeat :: Char -> ByteString
+repeat = L.repeat . c2w
+
+-- | /O(n)/ @'replicate' n x@ is a ByteString of length @n@ with @x@
+-- the value of every element.
+--
+replicate :: Int64 -> Char -> ByteString
+replicate w c = L.replicate w (c2w c)
+
+-- | /O(n)/ The 'unfoldr' function is analogous to the List \'unfoldr\'.
+-- 'unfoldr' builds a ByteString from a seed value.  The function takes
+-- the element and returns 'Nothing' if it is done producing the
+-- ByteString or returns 'Just' @(a,b)@, in which case, @a@ is a
+-- prepending to the ByteString and @b@ is used as the next element in a
+-- recursive call.
+unfoldr :: (a -> Maybe (Char, a)) -> a -> ByteString
+unfoldr f = L.unfoldr $ \a -> case f a of
+                                    Nothing      -> Nothing
+                                    Just (c, a') -> Just (c2w c, a')
+
+------------------------------------------------------------------------
+
+-- | 'takeWhile', applied to a predicate @p@ and a ByteString @xs@,
+-- returns the longest prefix (possibly empty) of @xs@ of elements that
+-- satisfy @p@.
+takeWhile :: (Char -> Bool) -> ByteString -> ByteString
+takeWhile f = L.takeWhile (f . w2c)
+{-# INLINE takeWhile #-}
+
+-- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
+dropWhile :: (Char -> Bool) -> ByteString -> ByteString
+dropWhile f = L.dropWhile (f . w2c)
+{-# INLINE dropWhile #-}
+
+-- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
+break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+break f = L.break (f . w2c)
+{-# INLINE break #-}
+
+-- | 'span' @p xs@ breaks the ByteString into two segments. It is
+-- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
+span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
+span f = L.span (f . w2c)
+{-# INLINE span #-}
+
+-- | 'breakChar' breaks its ByteString argument at the first occurence
+-- of the specified Char. It is more efficient than 'break' as it is
+-- implemented with @memchr(3)@. I.e.
+-- 
+-- > break (=='c') "abcd" == breakChar 'c' "abcd"
+--
+breakChar :: Char -> ByteString -> (ByteString, ByteString)
+breakChar = L.breakByte . c2w
+{-# INLINE breakChar #-}
+
+-- | 'spanChar' breaks its ByteString argument at the first
+-- occurence of a Char other than its argument. It is more efficient
+-- than 'span (==)'
+--
+-- > span  (=='c') "abcd" == spanByte 'c' "abcd"
+--
+spanChar :: Char -> ByteString -> (ByteString, ByteString)
+spanChar = L.spanByte . c2w
+{-# INLINE spanChar #-}
+
+-- | /O(n)/ Break a 'ByteString' into pieces separated by the byte
+-- argument, consuming the delimiter. I.e.
+--
+-- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
+-- > split 'a'  "aXaXaXa"    == ["","X","X","X"]
+-- > split 'x'  "x"          == ["",""]
+-- 
+-- and
+--
+-- > join [c] . split c == id
+-- > split == splitWith . (==)
+-- 
+-- As for all splitting functions in this library, this function does
+-- not copy the substrings, it just constructs new 'ByteStrings' that
+-- are slices of the original.
+--
+split :: Char -> ByteString -> [ByteString]
+split = L.split . c2w
+{-# INLINE split #-}
+
+-- | /O(n)/ Splits a 'ByteString' into components delimited by
+-- separators, where the predicate returns True for a separator element.
+-- The resulting components do not contain the separators.  Two adjacent
+-- separators result in an empty component in the output.  eg.
+--
+-- > splitWith (=='a') "aabbaca" == ["","","bb","c",""]
+--
+splitWith :: (Char -> Bool) -> ByteString -> [ByteString]
+splitWith f = L.splitWith (f . w2c)
+{-# INLINE splitWith #-}
+
+-- | Like 'splitWith', except that sequences of adjacent separators are
+-- treated as a single separator. eg.
+-- 
+-- > tokens (=='a') "aabbaca" == ["bb","c"]
+--
+tokens :: (Char -> Bool) -> ByteString -> [ByteString]
+tokens f = L.tokens (f . w2c)
+{-# INLINE tokens #-}
+
+-- | The 'groupBy' function is the non-overloaded version of 'group'.
+groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
+groupBy k = L.groupBy (\a b -> k (w2c a) (w2c b))
+
+-- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
+-- char. Around 4 times faster than the generalised join.
+--
+joinWithChar :: Char -> ByteString -> ByteString -> ByteString
+joinWithChar = L.joinWithByte . c2w
+{-# INLINE joinWithChar #-}
+
+-- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
+index :: ByteString -> Int64 -> Char
+index = (w2c .) . L.index
+{-# INLINE index #-}
+
+-- | /O(n)/ The 'elemIndex' function returns the index of the first
+-- element in the given 'ByteString' which is equal (by memchr) to the
+-- query element, or 'Nothing' if there is no such element.
+elemIndex :: Char -> ByteString -> Maybe Int64
+elemIndex = L.elemIndex . c2w
+{-# INLINE elemIndex #-}
+
+-- | /O(n)/ The 'elemIndices' function extends 'elemIndex', by returning
+-- the indices of all elements equal to the query element, in ascending order.
+elemIndices :: Char -> ByteString -> [Int64]
+elemIndices = L.elemIndices . c2w
+{-# INLINE elemIndices #-}
+
+-- | The 'findIndex' function takes a predicate and a 'ByteString' and
+-- returns the index of the first element in the ByteString satisfying the predicate.
+findIndex :: (Char -> Bool) -> ByteString -> Maybe Int64
+findIndex f = L.findIndex (f . w2c)
+{-# INLINE findIndex #-}
+
+-- | The 'findIndices' function extends 'findIndex', by returning the
+-- indices of all elements satisfying the predicate, in ascending order.
+findIndices :: (Char -> Bool) -> ByteString -> [Int64]
+findIndices f = L.findIndices (f . w2c)
+
+-- | count returns the number of times its argument appears in the ByteString
+--
+-- > count      == length . elemIndices
+-- > count '\n' == length . lines
+--
+-- But more efficiently than using length on the intermediate list.
+count :: Char -> ByteString -> Int64
+count c = L.count (c2w c)
+
+-- | /O(n)/ 'elem' is the 'ByteString' membership predicate. This
+-- implementation uses @memchr(3)@.
+elem :: Char -> ByteString -> Bool
+elem c = L.elem (c2w c)
+{-# INLINE elem #-}
+
+-- | /O(n)/ 'notElem' is the inverse of 'elem'
+notElem :: Char -> ByteString -> Bool
+notElem c = L.notElem (c2w c)
+{-# INLINE notElem #-}
+
+-- | /O(n)/ 'filter', applied to a predicate and a ByteString,
+-- returns a ByteString containing those characters that satisfy the
+-- predicate.
+filter :: (Char -> Bool) -> ByteString -> ByteString
+filter f = L.filter (f . w2c)
+{-# INLINE filter #-}
+
+-- | /O(n)/ The 'find' function takes a predicate and a ByteString,
+-- and returns the first element in matching the predicate, or 'Nothing'
+-- if there is no such element.
+find :: (Char -> Bool) -> ByteString -> Maybe Char
+find f ps = w2c `fmap` L.find (f . w2c) ps
+{-# INLINE find #-}
+
+-- | /O(n)/ A first order equivalent of /filter . (==)/, for the common
+-- case of filtering a single Char. It is more efficient to use
+-- filterChar in this case.
+--
+-- > filterChar == filter . (==)
+--
+-- filterChar is around 10x faster, and uses much less space, than its
+-- filter equivalent
+--
+filterChar :: Char -> ByteString -> ByteString
+filterChar c = L.filterByte (c2w c)
+{-# INLINE filterChar #-}
+
+-- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
+-- case of filtering a single Char out of a list. It is more efficient
+-- to use /filterNotChar/ in this case.
+--
+-- > filterNotChar == filter . (/=)
+--
+-- filterNotChar is around 3x faster, and uses much less space, than its
+-- filter equivalent
+--
+filterNotChar :: Char -> ByteString -> ByteString
+filterNotChar c = L.filterNotByte (c2w c)
+{-# INLINE filterNotChar #-}
+
+-- | /O(n)/ 'zip' takes two ByteStrings and returns a list of
+-- corresponding pairs of Chars. If one input ByteString is short,
+-- excess elements of the longer ByteString are discarded. This is
+-- equivalent to a pair of 'unpack' operations, and so space
+-- usage may be large for multi-megabyte ByteStrings
+zip :: ByteString -> ByteString -> [(Char,Char)]
+zip ps qs
+    | L.null ps || L.null qs = []
+    | otherwise = (head ps, head qs) : zip (L.tail ps) (L.tail qs)
+
+-- | 'zipWith' generalises 'zip' by zipping with the function given as
+-- the first argument, instead of a tupling function.  For example,
+-- @'zipWith' (+)@ is applied to two ByteStrings to produce the list
+-- of corresponding sums.
+zipWith :: (Char -> Char -> a) -> ByteString -> ByteString -> [a]
+zipWith f = L.zipWith ((. w2c) . f . w2c)
+
+-- | 'lines' breaks a ByteString up into a list of ByteStrings at
+-- newline Chars. The resulting strings do not contain newlines.
+--
+lines :: ByteString -> [ByteString]
+lines (LPS [])     = []
+lines (LPS (x:xs)) = loop0 x xs
+    where
+    -- this is a really performance sensitive function but the
+    -- chunked representation makes the general case a bit expensive
+    -- however assuming a large chunk size and normalish line lengths
+    -- we will find line endings much more frequently than chunk
+    -- endings so it makes sense to optimise for that common case.
+    -- So we partition into two special cases depending on whether we
+    -- are keeping back a list of chunks that will eventually be output
+    -- once we get to the end of the current line.
+
+    -- the common special case where we have no existing chunks of
+    -- the current line
+    loop0 :: B.ByteString -> [B.ByteString] -> [ByteString]
+    STRICT2(loop0)
+    loop0 ps pss =
+        case B.elemIndex (c2w '\n') ps of
+            Nothing -> case pss of
+                           []  | B.null ps ->            []
+                               | otherwise -> LPS [ps] : []
+                           (ps':pss')
+                               | B.null ps -> loop0 ps'      pss'
+                               | otherwise -> loop  ps' [ps] pss'
+
+            Just n | n /= 0    -> LPS [B.unsafeTake n ps]
+                                : loop0 (B.unsafeDrop (n+1) ps) pss
+                   | otherwise -> loop0 (B.unsafeTail ps) pss
+
+    -- the general case when we are building a list of chunks that are
+    -- part of the same line
+    loop :: B.ByteString -> [B.ByteString] -> [B.ByteString] -> [ByteString]
+    STRICT3(loop)
+    loop ps line pss =
+        case B.elemIndex (c2w '\n') ps of
+            Nothing ->
+                case pss of
+                    [] -> let ps' | B.null ps = P.reverse line
+                                  | otherwise = P.reverse (ps : line)
+                           in ps' `seq` (LPS ps' : [])
+
+                    (ps':pss')
+                        | B.null ps -> loop ps'       line  pss'
+                        | otherwise -> loop ps' (ps : line) pss'
+
+            Just n ->
+                let ps' | n == 0    = P.reverse line
+                        | otherwise = P.reverse (B.unsafeTake n ps : line)
+                 in ps' `seq` (LPS ps' : loop0 (B.unsafeDrop (n+1) ps) pss)
+
+-- | 'unlines' is an inverse operation to 'lines'.  It joins lines,
+-- after appending a terminating newline to each.
+unlines :: [ByteString] -> ByteString
+unlines [] = empty
+unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space
+    where nl = singleton '\n'
+
+-- | 'words' breaks a ByteString up into a list of words, which
+-- were delimited by Chars representing white space. And
+--
+-- > tokens isSpace = words
+--
+words :: ByteString -> [ByteString]
+words = L.tokens isSpaceWord8
+{-# INLINE words #-}
+
+-- | The 'unwords' function is analogous to the 'unlines' function, on words.
+unwords :: [ByteString] -> ByteString
+unwords = join (singleton ' ')
+{-# INLINE unwords #-}
+
+-- | readInt reads an Int from the beginning of the ByteString.  If
+-- there is no integer at the beginning of the string, it returns
+-- Nothing, otherwise it just returns the int read, and the rest of the
+-- string.
+readInt :: ByteString -> Maybe (Int, ByteString)
+readInt (LPS [])     = Nothing
+readInt (LPS (x:xs)) =
+        case w2c (B.unsafeHead x) of
+            '-' -> loop True  0 0 (B.unsafeTail x) xs
+            '+' -> loop False 0 0 (B.unsafeTail x) xs
+            _   -> loop False 0 0 x xs
+
+    where loop :: Bool -> Int -> Int -> B.ByteString -> [B.ByteString] -> Maybe (Int, ByteString)
+          STRICT5(loop)
+          loop neg i n ps pss
+              | B.null ps = case pss of
+                                []         -> end  neg i n ps  pss
+                                (ps':pss') -> loop neg i n ps' pss'
+              | otherwise =
+                  case B.unsafeHead ps of
+                    w | w >= 0x30
+                     && w <= 0x39 -> loop neg (i+1)
+                                          (n * 10 + (fromIntegral w - 0x30))
+                                          (B.unsafeTail ps) pss
+                      | otherwise -> end neg i n ps pss
+
+          end _   0 _ _  _   = Nothing
+          end neg _ n ps pss = let n'  | neg       = negate n
+                                       | otherwise = n
+                                   ps' | B.null ps =    pss
+                                       | otherwise = ps:pss
+                                in n' `seq` ps' `seq` Just $! (n', LPS ps')
+
diff --git a/Makefile b/Makefile
index d00bffe..7d96616 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@ ALL_DIRS = \
 	Control/Monad/ST \
 	Data \
 	Data/ByteString \
+	Data/ByteString/Lazy \
 	Data/Generics \
 	Data/Array \
 	Data/Array/IO \
diff --git a/base.cabal b/base.cabal
index 4aead21..cd40db8 100644
--- a/base.cabal
+++ b/base.cabal
@@ -40,6 +40,10 @@ exposed-modules:
 	Data.Bool,
 	Data.ByteString,
 	Data.ByteString.Char8,
+    Data.ByteString.Lazy
+    Data.ByteString.Lazy.Char8
+    Data.ByteString.Base
+    Data.ByteString.Fusion
 	Data.Char,
 	Data.Complex,
 	Data.Dynamic,
diff --git a/cbits/fpstring.c b/cbits/fpstring.c
index b8fc540..d42ebe5 100644
--- a/cbits/fpstring.c
+++ b/cbits/fpstring.c
@@ -32,7 +32,7 @@
 #include "fpstring.h"
 
 /* copy a string in reverse */
-void reverse(unsigned char *dest, unsigned char *from, int len) {
+void fps_reverse(unsigned char *dest, unsigned char *from, int len) {
     unsigned char *p, *q;
     p = from + len - 1;
     q = dest;
@@ -43,7 +43,7 @@ void reverse(unsigned char *dest, unsigned char *from, int len) {
 
 /* duplicate a string, interspersing the character through the elements
    of the duplicated string */
-void intersperse(unsigned char *dest, unsigned char *from, int len, char c) {
+void fps_intersperse(unsigned char *dest, unsigned char *from, int len, char c) {
     unsigned char *p, *q;
     p = from;
     q = dest;
@@ -55,7 +55,7 @@ void intersperse(unsigned char *dest, unsigned char *from, int len, char c) {
 }
 
 /* find maximum char in a packed string */
-unsigned char maximum(unsigned char *p, int len) {
+unsigned char fps_maximum(unsigned char *p, int len) {
     unsigned char *q, c = *p;
     for (q = p; q < p + len; q++)
         if (*q > c)
@@ -64,7 +64,7 @@ unsigned char maximum(unsigned char *p, int len) {
 }
 
 /* find minimum char in a packed string */
-unsigned char minimum(unsigned char *p, int len) {
+unsigned char fps_minimum(unsigned char *p, int len) {
     unsigned char *q, c = *p;
     for (q = p; q < p + len; q++)
         if (*q < c)
@@ -73,7 +73,7 @@ unsigned char minimum(unsigned char *p, int len) {
 }
 
 /* count the number of occurences of a char in a string */
-int count(unsigned char *p, int len, unsigned char w) {
+int fps_count(unsigned char *p, int len, unsigned char w) {
     int c;
     for (c = 0; len--; ++p)
         if (*p == w)
diff --git a/include/fpstring.h b/include/fpstring.h
index 614162d..42e8346 100644
--- a/include/fpstring.h
+++ b/include/fpstring.h
@@ -1,6 +1,6 @@
 
-void reverse(unsigned char *dest, unsigned char *from, int len);
-void intersperse(unsigned char *dest, unsigned char *from, int len, char c);
-unsigned char maximum(unsigned char *p, int len);
-unsigned char minimum(unsigned char *p, int len);
-int count(unsigned char *p, int len, unsigned char w);
+void fps_reverse(unsigned char *dest, unsigned char *from, int len);
+void fps_intersperse(unsigned char *dest, unsigned char *from, int len, char c);
+unsigned char fps_maximum(unsigned char *p, int len);
+unsigned char fps_minimum(unsigned char *p, int len);
+int fps_count(unsigned char *p, int len, unsigned char w);
diff --git a/package.conf.in b/package.conf.in
index ad57400..577f50b 100644
--- a/package.conf.in
+++ b/package.conf.in
@@ -38,6 +38,10 @@ exposed-modules:
 	Data.Bool,
 	Data.ByteString,
 	Data.ByteString.Char8,
+    Data.ByteString.Lazy
+    Data.ByteString.Lazy.Char8
+    Data.ByteString.Base
+    Data.ByteString.Fusion
 	Data.Char,
 	Data.Complex,
 	Data.Dynamic,
-- 
1.7.10.4