-{-# OPTIONS_GHC -cpp -optc-O1 -fffi -fglasgow-exts -fno-warn-incomplete-patterns #-}
---
--- -optc-O2 breaks with 4.0.4 gcc on debian
---
--- Module : ByteString.Lazy
+{-# OPTIONS_GHC -cpp -fglasgow-exts -fno-warn-orphans -fno-warn-incomplete-patterns #-}
+-- |
+-- Module : Data.ByteString.Lazy
-- Copyright : (c) Don Stewart 2006
-- (c) Duncan Coutts 2006
-- License : BSD-style
--
-- Maintainer : dons@cse.unsw.edu.au
-- Stability : experimental
--- Portability : portable, requires ffi and cpp
--- Tested with : GHC 6.4.1 and Hugs March 2005
+-- Portability : non-portable (instance of type synonym)
--
-
---
--- | A time and space-efficient implementation of lazy byte vectors
+-- A time and space-efficient implementation of lazy byte vectors
-- using lists of packed 'Word8' arrays, suitable for high performance
-- use, both in terms of large data quantities, or high speed
-- requirements. Byte vectors are encoded as lazy lists of strict 'Word8'
-- without requiring the entire vector be resident in memory.
--
-- Some operations, such as concat, append, reverse and cons, have
--- better complexity than their "Data.ByteString" equivalents, as due to
+-- better complexity than their "Data.ByteString" equivalents, due to
-- optimisations resulting from the list spine structure. And for other
--- operations Lazy ByteStrings are usually within a few percent of
+-- operations lazy ByteStrings are usually within a few percent of
-- strict ones, but with better heap usage. For data larger than the
-- available memory, or if you have tight memory constraints, this
-- module will be the only option. The default chunk size is 64k, which
--
-- > import qualified Data.ByteString.Lazy as B
--
--- Original GHC implementation by Bryan O\'Sullivan. Rewritten to use
--- UArray by Simon Marlow. Rewritten to support slices and use
--- ForeignPtr by David Roundy. Polished and extended by Don Stewart.
+-- Original GHC implementation by Bryan O\'Sullivan.
+-- Rewritten to use 'Data.Array.Unboxed.UArray' by Simon Marlow.
+-- Rewritten to support slices and use 'Foreign.ForeignPtr.ForeignPtr'
+-- by David Roundy.
+-- Polished and extended by Don Stewart.
-- Lazy variant by Duncan Coutts and Don Stewart.
--
module Data.ByteString.Lazy (
-- * The @ByteString@ type
- ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
+ ByteString, -- instances: Eq, Ord, Show, Read, Data, Typeable
-- * Introducing and eliminating 'ByteString's
empty, -- :: ByteString
- singleton, -- :: Word8 -> ByteString
+ singleton, -- :: Word8 -> ByteString
pack, -- :: [Word8] -> ByteString
unpack, -- :: ByteString -> [Word8]
- packWith, -- :: (a -> Word8) -> [a] -> ByteString
- unpackWith, -- :: (Word8 -> a) -> ByteString -> [a]
+ fromChunks, -- :: [Strict.ByteString] -> ByteString
+ toChunks, -- :: ByteString -> [Strict.ByteString]
-- * Basic interface
cons, -- :: Word8 -> ByteString -> ByteString
inits, -- :: ByteString -> [ByteString]
tails, -- :: ByteString -> [ByteString]
- -- ** Breaking and dropping on specific bytes
- breakByte, -- :: Word8 -> ByteString -> (ByteString, ByteString)
- spanByte, -- :: Word8 -> ByteString -> (ByteString, ByteString)
-
-- ** Breaking into many substrings
split, -- :: Word8 -> ByteString -> [ByteString]
splitWith, -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
- tokens, -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
-- ** Joining strings
join, -- :: ByteString -> [ByteString] -> ByteString
- joinWithByte, -- :: Word8 -> ByteString -> ByteString -> ByteString
-- * Predicates
isPrefixOf, -- :: ByteString -> ByteString -> Bool
-- ** Searching by equality
elem, -- :: Word8 -> ByteString -> Bool
notElem, -- :: Word8 -> ByteString -> Bool
- filterByte, -- :: Word8 -> ByteString -> ByteString
- filterNotByte, -- :: Word8 -> ByteString -> ByteString
-- ** Searching with a predicate
find, -- :: (Word8 -> Bool) -> ByteString -> Maybe Word8
-- * Ordered ByteStrings
-- sort, -- :: ByteString -> ByteString
+ copy, -- :: ByteString -> ByteString
+
-- * I\/O with 'ByteString's
-- ** Standard input and output
-- ** I\/O with Handles
hGetContents, -- :: Handle -> IO ByteString
- hGetContentsN, -- :: Int -> Handle -> IO ByteString
hGet, -- :: Handle -> Int -> IO ByteString
- hGetN, -- :: Int -> Handle -> Int -> IO ByteString
hPut, -- :: Handle -> ByteString -> IO ()
-#if defined(__GLASGOW_HASKELL__)
hGetNonBlocking, -- :: Handle -> IO ByteString
- hGetNonBlockingN, -- :: Int -> Handle -> IO ByteString
-#endif
+
+-- hGetN, -- :: Int -> Handle -> Int -> IO ByteString
+-- hGetContentsN, -- :: Int -> Handle -> IO ByteString
+-- hGetNonBlockingN, -- :: Int -> Handle -> IO ByteString
) where
import qualified Data.List as L -- L for list/lazy
import qualified Data.ByteString as P -- P for packed
import qualified Data.ByteString.Base as P
+import Data.ByteString.Base (LazyByteString(..))
import qualified Data.ByteString.Fusion as P
import Data.ByteString.Fusion (PairS(..),loopL)
import Data.Word (Word8)
import Data.Int (Int64)
-import System.IO (Handle,stdin,stdout,openBinaryFile,IOMode(..),hClose)
+import System.IO (Handle,stdin,stdout,openBinaryFile,IOMode(..)
+ ,hClose,hWaitForInput,hIsEOF)
import System.IO.Unsafe
import Control.Exception (bracket)
-#if defined(__GLASGOW_HASKELL__)
-import Data.Generics (Data(..), Typeable(..))
-#endif
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr
+import Foreign.Storable
-- -----------------------------------------------------------------------------
--
-- -----------------------------------------------------------------------------
--- | A space-efficient representation of a Word8 vector, supporting many
--- efficient operations. A 'ByteString' contains 8-bit characters only.
---
--- Instances of Eq, Ord, Read, Show, Data, Typeable
---
-newtype ByteString = LPS [P.ByteString] -- LPS for lazy packed string
- deriving (Show,Read
-#if defined(__GLASGOW_HASKELL__)
- ,Data, Typeable
-#endif
- )
+type ByteString = LazyByteString
--
-- hmm, what about getting the PS constructor unpacked into the cons cell?
-- and need to share the cache with other programs.
--
defaultChunkSize :: Int
-defaultChunkSize = 64 * k
+defaultChunkSize = 32 * k - overhead
where k = 1024
+ overhead = 2 * sizeOf (undefined :: Int)
smallChunkSize :: Int
-smallChunkSize = 4 * k
+smallChunkSize = 4 * k - overhead
where k = 1024
+ overhead = 2 * sizeOf (undefined :: Int)
-- defaultChunkSize = 1
unpack (LPS ss) = L.concatMap P.unpack ss
{-# INLINE unpack #-}
+-- | /O(c)/ Convert a list of strict 'ByteString' into a lazy 'ByteString'
+fromChunks :: [P.ByteString] -> ByteString
+fromChunks ls = LPS $ L.filter (not . P.null) ls
+
+-- | /O(n)/ Convert a lazy 'ByteString' into a list of strict 'ByteString'
+toChunks :: ByteString -> [P.ByteString]
+toChunks (LPS s) = s
+
------------------------------------------------------------------------
+{-
-- | /O(n)/ Convert a '[a]' into a 'ByteString' using some
-- conversion function
packWith :: (a -> Word8) -> [a] -> ByteString
unpackWith k (LPS ss) = L.concatMap (P.unpackWith k) ss
{-# INLINE unpackWith #-}
{-# SPECIALIZE unpackWith :: (Word8 -> Char) -> ByteString -> [Char] #-}
+-}
-- ---------------------------------------------------------------------
-- Basic interface
-- | /O(1)/ Test whether a ByteString is empty.
null :: ByteString -> Bool
null (LPS []) = True
-null (_) = False -- TODO: guarantee this invariant is maintained
+null (_) = False
{-# INLINE null #-}
-- | /O(n\/c)/ 'length' returns the length of a ByteString as an 'Int64'
length :: ByteString -> Int64
-length (LPS ss) = L.sum (L.map (fromIntegral.P.length) ss)
+length (LPS ss) = L.foldl' (\n ps -> n + fromIntegral (P.length ps)) 0 ss
-- avoid the intermediate list?
-- length (LPS ss) = L.foldl lengthF 0 ss
-- You can however use 'repeat' and 'cycle' to build infinite lazy ByteStrings.
--
cons :: Word8 -> ByteString -> ByteString
-cons c (LPS (s:ss)) | P.length s <= 16 = LPS (P.cons c s : ss)
-cons c (LPS ss) = LPS (P.singleton c : ss)
+cons c (LPS (s:ss)) | P.length s < 16 = LPS (P.cons c s : ss)
+cons c (LPS ss) = LPS (P.singleton c : ss)
{-# INLINE cons #-}
-- | /O(n\/c)/ Append a byte to the end of a 'ByteString'
last (LPS xs) = P.last (L.last xs)
{-# INLINE last #-}
--- | /O(1)/ Return all the elements of a 'ByteString' except the last one.
+-- | /O(n\/c)/ Return all the elements of a 'ByteString' except the last one.
init :: ByteString -> ByteString
init (LPS []) = errorEmptyList "init"
init (LPS xs)
-- | /O(n)/ 'reverse' @xs@ efficiently returns the elements of @xs@ in reverse order.
reverse :: ByteString -> ByteString
-reverse (LPS xs) = LPS (L.reverse . L.map P.reverse $ xs)
+reverse (LPS ps) = LPS (rev [] ps)
+ where rev a [] = a
+ rev a (x:xs) = rev (P.reverse x:a) xs
+-- note, here is one example where the extra element lazyness is an advantage.
+-- we can reerse the list of chunks strictly but reverse each chunk lazily
+-- so while we may force the whole lot into memory we do not need to copy
+-- each chunk until it is used.
{-# INLINE reverse #-}
-- The 'intersperse' function takes a 'Word8' and a 'ByteString' and
-- | /O(n)/ 'maximum' returns the maximum value from a 'ByteString'
maximum :: ByteString -> Word8
-maximum (LPS []) = errorEmptyList "maximum"
-maximum (LPS xs) = L.maximum (L.map P.maximum xs)
+maximum (LPS []) = errorEmptyList "maximum"
+maximum (LPS (x:xs)) = L.foldl' (\n ps -> n `max` P.maximum ps) (P.maximum x) xs
{-# INLINE maximum #-}
-- | /O(n)/ 'minimum' returns the minimum value from a 'ByteString'
minimum :: ByteString -> Word8
-minimum (LPS []) = errorEmptyList "minimum"
-minimum (LPS xs) = L.minimum (L.map P.minimum xs)
+minimum (LPS []) = errorEmptyList "minimum"
+minimum (LPS (x:xs)) = L.foldl' (\n ps -> n `min` P.minimum ps) (P.minimum x) xs
{-# INLINE minimum #-}
-- | The 'mapAccumL' function behaves like a combination of 'map' and
-- | /O(n\/c)/ 'take' @n@, applied to a ByteString @xs@, returns the prefix
-- of @xs@ of length @n@, or @xs@ itself if @n > 'length' xs@.
take :: Int64 -> ByteString -> ByteString
-take n _ | n < 0 = empty
-take i (LPS ps) = LPS (take' i ps)
- where take' _ [] = []
- take' 0 _ = []
+take i _ | i <= 0 = empty
+take i (LPS ps) = LPS (take' i ps)
+ where take' 0 _ = []
+ take' _ [] = []
take' n (x:xs) =
if n < fromIntegral (P.length x)
then P.take (fromIntegral n) x : []
drop :: Int64 -> ByteString -> ByteString
drop i p | i <= 0 = p
drop i (LPS ps) = LPS (drop' i ps)
- where drop' _ [] = []
- drop' 0 xs = xs
+ where drop' 0 xs = xs
+ drop' _ [] = []
drop' n (x:xs) =
if n < fromIntegral (P.length x)
then P.drop (fromIntegral n) x : xs
splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
splitAt i p | i <= 0 = (empty, p)
splitAt i (LPS ps) = case splitAt' i ps of (a,b) -> (LPS a, LPS b)
- where splitAt' _ [] = ([], [])
- splitAt' 0 xs = ([], xs)
+ where splitAt' 0 xs = ([], xs)
+ splitAt' _ [] = ([], [])
splitAt' n (x:xs) =
if n < fromIntegral (P.length x)
then (P.take (fromIntegral n) x : [],
takeWhile f (LPS ps) = LPS (takeWhile' ps)
where takeWhile' [] = []
takeWhile' (x:xs) =
- case P.findIndexOrEnd (not . f) x of
+ case findIndexOrEnd (not . f) x of
0 -> []
n | n < P.length x -> P.take n x : []
| otherwise -> x : takeWhile' xs
dropWhile f (LPS ps) = LPS (dropWhile' ps)
where dropWhile' [] = []
dropWhile' (x:xs) =
- case P.findIndexOrEnd (not . f) x of
+ case findIndexOrEnd (not . f) x of
n | n < P.length x -> P.drop n x : xs
| otherwise -> dropWhile' xs
break f (LPS ps) = case (break' ps) of (a,b) -> (LPS a, LPS b)
where break' [] = ([], [])
break' (x:xs) =
- case P.findIndexOrEnd f x of
+ case findIndexOrEnd f x of
0 -> ([], x : xs)
n | n < P.length x -> (P.take n x : [], P.drop n x : xs)
| otherwise -> let (xs', xs'') = break' xs
in (x : xs', xs'')
+--
+-- TODO
+--
+-- Add rules
+--
+
+{-
-- | 'breakByte' breaks its ByteString argument at the first occurence
-- of the specified byte. It is more efficient than 'break' as it is
-- implemented with @memchr(3)@. I.e.
| P.null x'' -> let (xs', xs'') = spanByte' xs
in (x : xs', xs'')
| otherwise -> (x' : [], x'' : xs)
+-}
-- | 'span' @p xs@ breaks the ByteString into two segments. It is
-- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
-- argument, consuming the delimiter. I.e.
--
-- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
--- > split 'a' "aXaXaXa" == ["","X","X","X"]
+-- > split 'a' "aXaXaXa" == ["","X","X","X",""]
-- > split 'x' "x" == ["",""]
--
-- and
{-# INLINE cons' #-}
{-# INLINE split #-}
+{-
-- | Like 'splitWith', except that sequences of adjacent separators are
-- treated as a single separator. eg.
--
--
tokens :: (Word8 -> Bool) -> ByteString -> [ByteString]
tokens f = L.filter (not.null) . splitWith f
+-}
-- | The 'group' function takes a ByteString and returns a list of
-- ByteStrings such that the concatenation of the result is equal to the
-- | The 'groupBy' function is the non-overloaded version of 'group'.
--
groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+groupBy = error "Data.ByteString.Lazy.groupBy: unimplemented"
+{-
groupBy _ (LPS []) = []
groupBy k (LPS (a:as)) = groupBy' [] 0 (P.groupBy k a) as
where groupBy' :: [P.ByteString] -> Word8 -> [P.ByteString] -> [P.ByteString] -> [ByteString]
groupBy' [] _ (s:[]) (x:xs) = groupBy' (s:[]) (P.unsafeHead s) (P.groupBy k x) xs
groupBy' acc c (s:[]) (x:xs) = groupBy' (s:acc) c (P.groupBy k x) xs
groupBy' acc _ (s:ss) xs = LPS (L.reverse (s : acc)) : groupBy' [] 0 ss xs
+-}
{-
TODO: check if something like this might be faster
join :: ByteString -> [ByteString] -> ByteString
join s = concat . (L.intersperse s)
--- | /O(n)/ joinWithByte. An efficient way to join to two ByteStrings
--- with a char.
---
-joinWithByte :: Word8 -> ByteString -> ByteString -> ByteString
-joinWithByte c x y = append x (cons c y)
-
-- ---------------------------------------------------------------------
-- Indexing ByteStrings
--
-- But more efficiently than using length on the intermediate list.
count :: Word8 -> ByteString -> Int64
-count w (LPS xs) = L.sum (L.map (fromIntegral . P.count w) xs)
+count w (LPS xs) = L.foldl' (\n ps -> n + fromIntegral (P.count w ps)) 0 xs
-- | The 'findIndex' function takes a predicate and a 'ByteString' and
-- returns the index of the first element in the ByteString
filter p = LPS . P.loopArr . loopL (P.filterEFL p) P.NoAcc . unLPS
{-# INLINE filter #-}
+{-
-- | /O(n)/ and /O(n\/c) space/ A first order equivalent of /filter .
-- (==)/, for the common case of filtering a single byte. It is more
-- efficient to use /filterByte/ in this case.
-- filterNotByte is around 2x faster than its filter equivalent.
filterNotByte :: Word8 -> ByteString -> ByteString
filterNotByte w (LPS xs) = LPS (filterMap (P.filterNotByte w) xs)
+-}
-- ---------------------------------------------------------------------
-- Searching for substrings
| otherwise = LPS xs : tails' (P.unsafeTail x : xs')
-- ---------------------------------------------------------------------
+-- Low level constructors
+
+-- | /O(n)/ Make a copy of the 'ByteString' with its own storage.
+-- This is mainly useful to allow the rest of the data pointed
+-- to by the 'ByteString' to be garbage collected, for example
+-- if a large string has been read in, and only a small part of it
+-- is needed in the rest of the program.
+copy :: ByteString -> ByteString
+copy (LPS lps) = LPS (L.map P.copy lps)
+--TODO, we could coalese small blocks here
+--FIXME: probably not strict enough, if we're doing this to avoid retaining
+-- the parent blocks then we'd better copy strictly.
+
+-- ---------------------------------------------------------------------
-- TODO defrag func that concatenates block together that are below a threshold
-- defrag :: Int -> ByteString -> ByteString
-- Lazy ByteString IO
-- | Read entire handle contents /lazily/ into a 'ByteString'. Chunks
--- are read on demand, in @k@-sized chunks.
+-- are read on demand, in at most @k@-sized chunks. It does not block
+-- waiting for a whole @k@-sized chunk, so if less than @k@ bytes are
+-- available then they will be returned immediately as a smaller chunk.
hGetContentsN :: Int -> Handle -> IO ByteString
hGetContentsN k h = lazyRead >>= return . LPS
where
- lazyRead = unsafeInterleaveIO $ do
- ps <- P.hGet h k
- case P.length ps of
- 0 -> return []
- n | n < k -> return [ps]
- _ -> do pss <- lazyRead
- return (ps : pss)
+ lazyRead = unsafeInterleaveIO loop
+
+ loop = do
+ ps <- P.hGetNonBlocking h k
+ --TODO: I think this should distinguish EOF from no data available
+ -- the otherlying POSIX call makes this distincion, returning either
+ -- 0 or EAGAIN
+ if P.null ps
+ then do eof <- hIsEOF h
+ if eof then return []
+ else hWaitForInput h (-1)
+ >> loop
+ else do pss <- lazyRead
+ return (ps : pss)
-- | Read @n@ bytes into a 'ByteString', directly from the
-- specified 'Handle', in chunks of size @k@.
readChunks i = do
ps <- P.hGet h (min k i)
case P.length ps of
- 0 -> return []
- m | m == i -> return [ps]
- m -> do pss <- readChunks (i - m)
- return (ps : pss)
+ 0 -> return []
+ m -> do pss <- readChunks (i - m)
+ return (ps : pss)
-#if defined(__GLASGOW_HASKELL__)
-- | hGetNonBlockingN is similar to 'hGetContentsN', except that it will never block
-- waiting for data to become available, instead it returns only whatever data
-- is available. Chunks are read on demand, in @k@-sized chunks.
hGetNonBlockingN :: Int -> Handle -> Int -> IO ByteString
+#if defined(__GLASGOW_HASKELL__)
hGetNonBlockingN _ _ 0 = return empty
hGetNonBlockingN k h n = readChunks n >>= return . LPS
where
+ STRICT1(readChunks)
readChunks i = do
ps <- P.hGetNonBlocking h (min k i)
case P.length ps of
- 0 -> return []
- m | fromIntegral m < i -> return [ps]
- m -> do pss <- readChunks (i - m)
- return (ps : pss)
+ 0 -> return []
+ m -> do pss <- readChunks (i - m)
+ return (ps : pss)
+#else
+hGetNonBlockingN = hGetN
#endif
-- | Read entire handle contents /lazily/ into a 'ByteString'. Chunks
hGet :: Handle -> Int -> IO ByteString
hGet = hGetN defaultChunkSize
-#if defined(__GLASGOW_HASKELL__)
-- | hGetNonBlocking is similar to 'hGet', except that it will never block
-- waiting for data to become available, instead it returns only whatever data
-- is available.
+#if defined(__GLASGOW_HASKELL__)
hGetNonBlocking :: Handle -> Int -> IO ByteString
hGetNonBlocking = hGetNonBlockingN defaultChunkSize
+#else
+hGetNonBlocking = hGet
#endif
-
-- | Read an entire file /lazily/ into a 'ByteString'.
readFile :: FilePath -> IO ByteString
readFile f = openBinaryFile f ReadMode >>= hGetContents
| otherwise -> y : filterMap f xs
{-# INLINE filterMap #-}
+
+-- | 'findIndexOrEnd' is a variant of findIndex, that returns the length
+-- of the string if no element is found, rather than Nothing.
+findIndexOrEnd :: (Word8 -> Bool) -> P.ByteString -> Int
+findIndexOrEnd k (P.PS x s l) = P.inlinePerformIO $ withForeignPtr x $ \f -> go (f `plusPtr` s) 0
+ where
+ STRICT2(go)
+ go ptr n | n >= l = return l
+ | otherwise = do w <- peek ptr
+ if k w
+ then return n
+ else go (ptr `plusPtr` 1) (n+1)
+{-# INLINE findIndexOrEnd #-}