From: Don Stewart Date: Wed, 23 Aug 2006 14:33:38 +0000 (+0000) Subject: Sync Data.ByteString with current stable branch, 0.7 X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=a87eeb296e699c8000b371c9676158499be3d17f;p=ghc-base.git Sync Data.ByteString with current stable branch, 0.7 --- diff --git a/Data/ByteString.hs b/Data/ByteString.hs index f030970..6dbc3e6 100644 --- a/Data/ByteString.hs +++ b/Data/ByteString.hs @@ -46,8 +46,6 @@ module Data.ByteString ( singleton, -- :: Word8 -> ByteString pack, -- :: [Word8] -> ByteString unpack, -- :: ByteString -> [Word8] - packWith, -- :: (a -> Word8) -> [a] -> ByteString - unpackWith, -- :: (Word8 -> a) -> ByteString -> [a] -- * Basic interface cons, -- :: Word8 -> ByteString -> ByteString @@ -62,7 +60,6 @@ module Data.ByteString ( -- * Transformating ByteStrings map, -- :: (Word8 -> Word8) -> ByteString -> ByteString - map', -- :: (Word8 -> Word8) -> ByteString -> ByteString reverse, -- :: ByteString -> ByteString intersperse, -- :: Word8 -> ByteString -> ByteString transpose, -- :: [ByteString] -> [ByteString] @@ -72,8 +69,11 @@ module Data.ByteString ( foldl', -- :: (a -> Word8 -> a) -> a -> ByteString -> a foldl1, -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8 foldl1', -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8 + foldr, -- :: (Word8 -> a -> a) -> a -> ByteString -> a + foldr', -- :: (Word8 -> a -> a) -> a -> ByteString -> a foldr1, -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8 + foldr1', -- :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8 -- ** Special folds concat, -- :: [ByteString] -> ByteString @@ -98,7 +98,7 @@ module Data.ByteString ( -- ** Unfolding ByteStrings replicate, -- :: Int -> Word8 -> ByteString unfoldr, -- :: (a -> Maybe (Word8, a)) -> a -> ByteString - unfoldrN, -- :: Int -> (a -> Maybe (Word8, a)) -> a -> ByteString + unfoldrN, -- :: Int -> (a -> Maybe (Word8, a)) -> a -> (ByteString, Maybe a) -- * Substrings @@ -117,18 +117,12 @@ module Data.ByteString ( inits, -- :: ByteString -> [ByteString] tails, -- :: ByteString -> [ByteString] - -- ** Breaking and dropping on specific bytes - breakByte, -- :: Word8 -> ByteString -> (ByteString, ByteString) - spanByte, -- :: Word8 -> ByteString -> (ByteString, ByteString) - -- ** Breaking into many substrings split, -- :: Word8 -> ByteString -> [ByteString] splitWith, -- :: (Word8 -> Bool) -> ByteString -> [ByteString] - tokens, -- :: (Word8 -> Bool) -> ByteString -> [ByteString] -- ** Joining strings join, -- :: ByteString -> [ByteString] -> ByteString - joinWithByte, -- :: Word8 -> ByteString -> ByteString -> ByteString -- * Predicates isPrefixOf, -- :: ByteString -> ByteString -> Bool @@ -145,13 +139,10 @@ module Data.ByteString ( -- | These functions use memchr(3) to efficiently search the ByteString elem, -- :: Word8 -> ByteString -> Bool notElem, -- :: Word8 -> ByteString -> Bool - filterByte, -- :: Word8 -> ByteString -> ByteString - filterNotByte, -- :: Word8 -> ByteString -> ByteString -- ** Searching with a predicate find, -- :: (Word8 -> Bool) -> ByteString -> Maybe Word8 filter, -- :: (Word8 -> Bool) -> ByteString -> ByteString - filter', -- :: (Word8 -> Bool) -> ByteString -> ByteString -- partition -- :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) -- * Indexing ByteStrings @@ -162,12 +153,10 @@ module Data.ByteString ( findIndex, -- :: (Word8 -> Bool) -> ByteString -> Maybe Int findIndices, -- :: (Word8 -> Bool) -> ByteString -> [Int] count, -- :: Word8 -> ByteString -> Int - findIndexOrEnd, -- :: (Word8 -> Bool) -> ByteString -> Int -- * Zipping and unzipping ByteStrings zip, -- :: ByteString -> ByteString -> [(Word8,Word8)] zipWith, -- :: (Word8 -> Word8 -> c) -> ByteString -> ByteString -> [c] - zipWith', unzip, -- :: [(Word8,Word8)] -> (ByteString,ByteString) -- * Ordered ByteStrings @@ -193,13 +182,11 @@ module Data.ByteString ( -- * I\/O with 'ByteString's -- ** Standard input and output - -#if defined(__GLASGOW_HASKELL__) getLine, -- :: IO ByteString -#endif getContents, -- :: IO ByteString putStr, -- :: ByteString -> IO () putStrLn, -- :: ByteString -> IO () + interact, -- :: (ByteString -> ByteString) -> IO () -- ** Files readFile, -- :: FilePath -> IO ByteString @@ -208,23 +195,20 @@ module Data.ByteString ( -- mmapFile, -- :: FilePath -> IO ByteString -- ** I\/O with Handles -#if defined(__GLASGOW_HASKELL__) - getArgs, -- :: IO [ByteString] hGetLine, -- :: Handle -> IO ByteString - hGetLines, -- :: Handle -> IO [ByteString] - hGetNonBlocking, -- :: Handle -> Int -> IO ByteString -#endif hGetContents, -- :: Handle -> IO ByteString hGet, -- :: Handle -> Int -> IO ByteString + hGetNonBlocking, -- :: Handle -> Int -> IO ByteString hPut, -- :: Handle -> ByteString -> IO () hPutStr, -- :: Handle -> ByteString -> IO () hPutStrLn, -- :: Handle -> ByteString -> IO () - -- * Fusion utilities #if defined(__GLASGOW_HASKELL__) + -- * Fusion utilities unpackList, -- eek, otherwise it gets thrown away by the simplifier -#endif lengthU, maximumU, minimumU +#endif + ) where import qualified Prelude as P @@ -235,7 +219,7 @@ import Prelude hiding (reverse,head,tail,last,init,null ,minimum,all,concatMap,foldl1,foldr1 ,scanl,scanl1,scanr,scanr1 ,readFile,writeFile,appendFile,replicate - ,getContents,getLine,putStr,putStrLn + ,getContents,getLine,putStr,putStrLn,interact ,zip,zipWith,unzip,notElem) import Data.ByteString.Base @@ -250,6 +234,7 @@ import qualified Data.Array as Array ((!)) -- Control.Exception.bracket not available in yhc or nhc import Control.Exception (bracket, assert) +import qualified Control.Exception as Exception import Control.Monad (when) import Foreign.C.String (CString, CStringLen) @@ -268,6 +253,8 @@ import Data.Monoid (Monoid, mempty, mappend, mconcat) #if !defined(__GLASGOW_HASKELL__) import System.IO.Unsafe +import qualified System.Environment +import qualified System.IO (hGetLine) #endif #if defined(__GLASGOW_HASKELL__) @@ -275,9 +262,6 @@ import System.IO.Unsafe import System.IO (hGetBufNonBlocking) import System.IO.Error (isEOFError) -import Foreign.Marshal (alloca) -import qualified Foreign.Concurrent as FC (newForeignPtr) - import GHC.Handle import GHC.Prim (Word#, (+#), writeWord8OffAddr#) import GHC.Base (build) @@ -375,15 +359,10 @@ cmp p1 p2 n len1 len2 -- ----------------------------------------------------------------------------- -- Introducing and eliminating 'ByteString's --- | /O(1)/ The empty 'ByteString' -empty :: ByteString -empty = unsafeCreate 0 $ const $ return () -{-# NOINLINE empty #-} - -- | /O(1)/ Convert a 'Word8' into a 'ByteString' singleton :: Word8 -> ByteString singleton c = unsafeCreate 1 $ \p -> poke p c -{-# INLINE singleton #-} +{-# INLINE [1] singleton #-} -- -- XXX The unsafePerformIO is critical! @@ -445,17 +424,14 @@ unpack (PS ps s l) = inlinePerformIO $ withForeignPtr ps $ \p -> #else --- --- Interacting with head/build fusion rule in ghc 6.5. Disable for now --- - unpack ps = build (unpackFoldr ps) {-# INLINE unpack #-} -- -- critical this isn't strict in the acc -- as it will break in the presence of list fusion. this is a known --- issue with seq and rewrite rules +-- issue with seq and build/foldr rewrite rules, which rely on lazy +-- demanding to avoid bottoms in the list. -- unpackFoldr :: ByteString -> (Word8 -> a -> a) -> a -> a unpackFoldr (PS fp off len) f ch = withPtr fp $ \p -> do @@ -618,6 +594,7 @@ map f = loopArr . loopMap f #endif {-# INLINE map #-} +{- -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is -- slightly faster for one-shot cases. map' :: (Word8 -> Word8) -> ByteString -> ByteString @@ -633,6 +610,7 @@ map' f (PS fp s len) = inlinePerformIO $ withForeignPtr fp $ \a -> pokeByteOff p2 n (f x) map_ (n+1) p1 p2 {-# INLINE map' #-} +-} -- | /O(n)/ 'reverse' @xs@ efficiently returns the elements of @xs@ in reverse order. reverse :: ByteString -> ByteString @@ -703,6 +681,17 @@ foldr :: (Word8 -> a -> a) -> a -> ByteString -> a foldr k z = loopAcc . loopDown (foldEFL (flip k)) z {-# INLINE foldr #-} +-- | 'foldr\'' is like 'foldr', but strict in the accumulator. +foldr' :: (Word8 -> a -> a) -> a -> ByteString -> a +foldr' k v (PS x s l) = inlinePerformIO $ withForeignPtr x $ \ptr -> + go v (ptr `plusPtr` (s+l-1)) (ptr `plusPtr` (s-1)) + where + STRICT3(go) + go z p q | p == q = return z + | otherwise = do c <- peek p + go (c `k` z) (p `plusPtr` (-1)) q -- tail recursive +{-# INLINE [1] foldr' #-} + -- | 'foldl1' is a variant of 'foldl' that has no starting value -- argument, and thus must be applied to non-empty 'ByteStrings'. -- This function is subject to array fusion. @@ -730,6 +719,14 @@ foldr1 f ps | otherwise = foldr f (last ps) (init ps) {-# INLINE foldr1 #-} +-- | 'foldr1\'' is a variant of 'foldr1', but is strict in the +-- accumulator. +foldr1' :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8 +foldr1' f ps + | null ps = errorEmptyList "foldr1" + | otherwise = foldr' f (last ps) (init ps) +{-# INLINE [1] foldr1' #-} + -- --------------------------------------------------------------------- -- Special folds @@ -1004,13 +1001,19 @@ dropWhile f ps = unsafeDrop (findIndexOrEnd (not . f) ps) ps -- | 'break' @p@ is equivalent to @'span' ('not' . p)@. break :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) break p ps = case findIndexOrEnd p ps of n -> (unsafeTake n ps, unsafeDrop n ps) -{-# INLINE break #-} +{-# INLINE [1] break #-} --- | 'breakEnd' behaves like 'break' but from the end of the 'ByteString' --- --- breakEnd p == spanEnd (not.p) -breakEnd :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) -breakEnd p ps = splitAt (findFromEndUntil p ps) ps +{-# RULES +"FPS specialise break (x==)" forall x. + break ((==) x) = breakByte x + #-} + +#if __GLASGOW_HASKELL__ >= 605 +-- {-# RULES +-- "FPS specialise break (==x)" forall x. +-- break (==x) = breakByte x +-- #-} +#endif -- | 'breakByte' breaks its ByteString argument at the first occurence -- of the specified byte. It is more efficient than 'break' as it is @@ -1024,6 +1027,18 @@ breakByte c p = case elemIndex c p of Just n -> (unsafeTake n p, unsafeDrop n p) {-# INLINE breakByte #-} +-- | 'breakEnd' behaves like 'break' but from the end of the 'ByteString' +-- +-- breakEnd p == spanEnd (not.p) +breakEnd :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) +breakEnd p ps = splitAt (findFromEndUntil p ps) ps + +-- | 'span' @p xs@ breaks the ByteString into two segments. It is +-- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@ +span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) +span p ps = break (not . p) ps +{-# INLINE [1] span #-} + -- | 'spanByte' breaks its ByteString argument at the first -- occurence of a byte other than its argument. It is more efficient -- than 'span (==)' @@ -1042,11 +1057,17 @@ spanByte c ps@(PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> else go p (i+1) {-# INLINE spanByte #-} --- | 'span' @p xs@ breaks the ByteString into two segments. It is --- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@ -span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) -span p ps = break (not . p) ps -{-# INLINE span #-} +{-# RULES +"FPS specialise span (x==)" forall x. + span ((==) x) = spanByte x + #-} + +#if __GLASGOW_HASKELL__ >= 605 +-- {-# RULES +-- "FPS specialise span (==x)" forall x. +-- span (==x) = spanByte x +-- #-} +#endif -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'. -- We have @@ -1165,6 +1186,7 @@ split (W8# w#) (PS fp off len) = splitWith' off len fp else splitLoop p (idx'+1) off' len' fp' -} +{- -- | Like 'splitWith', except that sequences of adjacent separators are -- treated as a single separator. eg. -- @@ -1173,6 +1195,7 @@ split (W8# w#) (PS fp off len) = splitWith' off len fp tokens :: (Word8 -> Bool) -> ByteString -> [ByteString] tokens f = P.filter (not.null) . splitWith f {-# INLINE tokens #-} +-} -- | The 'group' function takes a ByteString and returns a list of -- ByteStrings such that the concatenation of the result is equal to the @@ -1204,7 +1227,12 @@ groupBy k xs -- argument between each element of the list. join :: ByteString -> [ByteString] -> ByteString join s = concat . (List.intersperse s) -{-# INLINE join #-} +{-# INLINE [1] join #-} + +{-# RULES +"FPS specialise join c -> joinByte" forall c s1 s2 . + join (singleton c) (s1 : s2 : []) = joinWithByte c s1 s2 + #-} -- -- | /O(n)/ joinWithByte. An efficient way to join to two ByteStrings @@ -1342,19 +1370,6 @@ findIndices p ps = loop 0 ps | p (unsafeHead qs) = n : loop (n+1) (unsafeTail qs) | otherwise = loop (n+1) (unsafeTail qs) --- | 'findIndexOrEnd' is a variant of findIndex, that returns the length --- of the string if no element is found, rather than Nothing. -findIndexOrEnd :: (Word8 -> Bool) -> ByteString -> Int -findIndexOrEnd k (PS x s l) = inlinePerformIO $ withForeignPtr x $ \f -> go (f `plusPtr` s) 0 - where - STRICT2(go) - go ptr n | n >= l = return l - | otherwise = do w <- peek ptr - if k w - then return n - else go (ptr `plusPtr` 1) (n+1) -{-# INLINE findIndexOrEnd #-} - -- --------------------------------------------------------------------- -- Searching ByteStrings @@ -1383,6 +1398,7 @@ filter f = loopArr . loopFilter f #endif {-# INLINE filter #-} +{- -- | /O(n)/ 'filter\'' is a non-fuseable version of filter, that may be -- around 2x faster for some one-shot applications. filter' :: (Word8 -> Bool) -> ByteString -> ByteString @@ -1400,6 +1416,7 @@ filter' k ps@(PS x s l) then poke t w >> go (f `plusPtr` 1) (t `plusPtr` 1) end else go (f `plusPtr` 1) t end {-# INLINE filter' #-} +-} -- -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common @@ -1414,6 +1431,10 @@ filterByte :: Word8 -> ByteString -> ByteString filterByte w ps = replicate (count w ps) w {-# INLINE filterByte #-} +{-# RULES + "FPS specialise filter (== x)" forall x. + filter ((==) x) = filterByte x + #-} -- -- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common -- case of filtering a single byte out of a list. It is more efficient @@ -1423,9 +1444,13 @@ filterByte w ps = replicate (count w ps) w -- -- filterNotByte is around 2x faster than its filter equivalent. filterNotByte :: Word8 -> ByteString -> ByteString -filterNotByte w = filter' (/= w) +filterNotByte w = filter (/= w) {-# INLINE filterNotByte #-} +{-# RULES +"FPS specialise filter (/= x)" forall x. + filter ((/=) x) = filterNotByte x + #-} -- | /O(n)/ The 'find' function takes a predicate and a ByteString, -- and returns the first element in matching the predicate, or 'Nothing' -- if there is no such element. @@ -1570,8 +1595,8 @@ zipWith' f (PS fp s l) (PS fq t m) = inlinePerformIO $ {-# RULES -"Specialise zipWith" forall (f :: Word8 -> Word8 -> Word8) p q . - pack (zipWith f p q) = zipWith' f p q +"FPS specialise zipWith" forall (f :: Word8 -> Word8 -> Word8) p q . + zipWith f p q = unpack (zipWith' f p q) #-} -- | /O(n)/ 'unzip' transforms a list of pairs of bytes into a pair of @@ -1659,19 +1684,29 @@ packMallocCString cstr = unsafePerformIO $ do -- null-terminated @CString@. The @CString@ will be freed -- automatically. This is a memcpy(3). useAsCString :: ByteString -> (CString -> IO a) -> IO a -useAsCString ps f = useAsCStringLen ps (\(s,_) -> f s) - --- | /O(n) construction/ Use a @ByteString@ with a function requiring a --- @CStringLen@. The @CStringLen@ will be freed automatically. This is a --- memcpy(3). +useAsCString (PS ps s l) = bracket alloc (c_free.castPtr) + where alloc = withForeignPtr ps $ \p -> do + buf <- c_malloc (fromIntegral l+1) + memcpy (castPtr buf) (castPtr p `plusPtr` s) (fromIntegral l) + poke (buf `plusPtr` l) (0::Word8) -- n.b. + return (castPtr buf) + +-- | /O(1) construction/ Use a @ByteString@ with a function requiring a @CStringLen@. useAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a -useAsCStringLen (PS ps s l) = bracket alloc (c_free.castPtr.fst) - where - alloc = withForeignPtr ps $ \p -> do - buf <- c_malloc (fromIntegral l+1) - memcpy (castPtr buf) (castPtr p `plusPtr` s) (fromIntegral l) - poke (buf `plusPtr` l) (0::Word8) -- n.b. - return $! (castPtr buf, l) +useAsCStringLen = unsafeUseAsCStringLen + +-- +-- why were we doing this? +-- +-- useAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a +-- useAsCStringLen (PS ps s l) = bracket alloc (c_free.castPtr.fst) +-- where +-- alloc = withForeignPtr ps $ \p -> do +-- buf <- c_malloc (fromIntegral l+1) +-- memcpy (castPtr buf) (castPtr p `plusPtr` s) (fromIntegral l) +-- poke (buf `plusPtr` l) (0::Word8) -- n.b. +-- return $! (castPtr buf, l) +-- -- | /O(n)/ Make a copy of the 'ByteString' with its own storage. -- This is mainly useful to allow the rest of the data pointed @@ -1697,27 +1732,36 @@ copyCStringLen (cstr, len) = create len $ \p -> -- --------------------------------------------------------------------- -- line IO -#if defined(__GLASGOW_HASKELL__) - --- | getLine, read a line from stdin. +-- | Read a line from stdin. getLine :: IO ByteString getLine = hGetLine stdin +{- -- | Lazily construct a list of lines of ByteStrings. This will be much --- better on memory consumption than using lines =<< getContents. +-- better on memory consumption than using 'hGetContents >>= lines' +-- If you're considering this, a better choice might be to use +-- Data.ByteString.Lazy hGetLines :: Handle -> IO [ByteString] hGetLines h = go where go = unsafeInterleaveIO $ do - ms <- catch (hGetLine h >>= return . Just) - (\_ -> return Nothing) - case ms of - Nothing -> return [] - Just s -> do ss <- go - return (s:ss) - --- | hGetLine. read a ByteString from a handle + e <- hIsEOF h + if e + then return [] + else do + x <- hGetLine h + xs <- go + return (x:xs) +-} + +-- | Read a line from a handle + hGetLine :: Handle -> IO ByteString +#if !defined(__GLASGOW_HASKELL__) +hGetLine h = do + string <- System.IO.hGetLine h + return $ packWith c2w string +#else hGetLine h = wantReadableHandle "Data.ByteString.hGetLine" h $ \ handle_ -> do case haBufferMode handle_ of NoBuffering -> error "no buffering" @@ -1818,13 +1862,15 @@ hGet :: Handle -> Int -> IO ByteString hGet _ 0 = return empty hGet h i = createAndTrim i $ \p -> hGetBuf h p i -#if defined(__GLASGOW_HASKELL__) -- | hGetNonBlocking is identical to 'hGet', except that it will never block -- waiting for data to become available, instead it returns only whatever data -- is available. hGetNonBlocking :: Handle -> Int -> IO ByteString +#if defined(__GLASGOW_HASKELL__) hGetNonBlocking _ 0 = return empty hGetNonBlocking h i = createAndTrim i $ \p -> hGetBufNonBlocking h p i +#else +hGetNonBlocking = hGet #endif -- | Read entire handle contents into a 'ByteString'. @@ -1862,23 +1908,31 @@ hGetContents h = do getContents :: IO ByteString getContents = hGetContents stdin +-- | The interact function takes a function of type @ByteString -> ByteString@ +-- as its argument. The entire input from the standard input device is passed +-- to this function as its argument, and the resulting string is output on the +-- standard output device. It's great for writing one line programs! +interact :: (ByteString -> ByteString) -> IO () +interact transformer = putStr . transformer =<< getContents + -- | Read an entire file strictly into a 'ByteString'. This is far more -- efficient than reading the characters into a 'String' and then using -- 'pack'. It also may be more efficient than opening the file and --- reading it using hGet. +-- reading it using hGet. Files are read using 'binary mode' on Windows, +-- for 'text mode' use the Char8 version of this function. readFile :: FilePath -> IO ByteString readFile f = bracket (openBinaryFile f ReadMode) hClose (\h -> hFileSize h >>= hGet h . fromIntegral) -- | Write a 'ByteString' to a file. writeFile :: FilePath -> ByteString -> IO () -writeFile f ps = bracket (openBinaryFile f WriteMode) hClose - (\h -> hPut h ps) +writeFile f txt = bracket (openBinaryFile f WriteMode) hClose + (\h -> hPut h txt) -- | Append a 'ByteString' to a file. appendFile :: FilePath -> ByteString -> IO () appendFile f txt = bracket (openBinaryFile f AppendMode) hClose - (\hdl -> hPut hdl txt) + (\h -> hPut h txt) {- -- @@ -1930,7 +1984,7 @@ mmap f = do #else let unmap = return () #endif - fp <- FC.newForeignPtr p unmap + fp <- newForeignPtr p unmap return fp c_close fd hClose h @@ -1938,23 +1992,22 @@ mmap f = do where mmap_limit = 16*1024 -} -#if defined(__GLASGOW_HASKELL__) --- --- | A ByteString equivalent for getArgs. More efficient for large argument lists --- -getArgs :: IO [ByteString] -getArgs = - alloca $ \ p_argc -> - alloca $ \ p_argv -> do - getProgArgv p_argc p_argv - p <- fromIntegral `fmap` peek p_argc - argv <- peek p_argv - P.map packCString `fmap` peekArray (p - 1) (advancePtr argv 1) -#endif - -- --------------------------------------------------------------------- -- Internal utilities +-- | 'findIndexOrEnd' is a variant of findIndex, that returns the length +-- of the string if no element is found, rather than Nothing. +findIndexOrEnd :: (Word8 -> Bool) -> ByteString -> Int +findIndexOrEnd k (PS x s l) = inlinePerformIO $ withForeignPtr x $ \f -> go (f `plusPtr` s) 0 + where + STRICT2(go) + go ptr n | n >= l = return l + | otherwise = do w <- peek ptr + if k w + then return n + else go (ptr `plusPtr` 1) (n+1) +{-# INLINE findIndexOrEnd #-} + -- | Perform an operation with a temporary ByteString withPtr :: ForeignPtr a -> (Ptr a -> IO b) -> b withPtr fp io = inlinePerformIO (withForeignPtr fp io) @@ -1980,8 +2033,4 @@ findFromEndUntil f ps@(PS x s l) = {-# INLINE newForeignFreePtr #-} newForeignFreePtr :: Ptr Word8 -> IO (ForeignPtr Word8) -#if defined(__GLASGOW_HASKELL__) -newForeignFreePtr p = FC.newForeignPtr p (c_free p) -#else newForeignFreePtr p = newForeignPtr c_free_finalizer p -#endif diff --git a/Data/ByteString/Base.hs b/Data/ByteString/Base.hs index dac2a16..f3c869d 100644 --- a/Data/ByteString/Base.hs +++ b/Data/ByteString/Base.hs @@ -27,9 +27,11 @@ module Data.ByteString.Base ( unsafeDrop, -- :: Int -> ByteString -> ByteString -- * Low level introduction and elimination + empty, -- :: ByteString create, -- :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString createAndTrim, -- :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString createAndTrim', -- :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a) + mallocByteString, -- :: Int -> IO (ForeignPtr a) unsafeCreate, -- :: Int -> (Ptr Word8 -> IO ()) -> ByteString unsafeUseAsCString, -- :: ByteString -> (CString -> IO a) -> IO a @@ -47,6 +49,7 @@ module Data.ByteString.Base ( -- * Utilities inlinePerformIO, -- :: IO a -> a + nullForeignPtr, -- :: ForeignPtr Word8 countOccurrences, -- :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO () @@ -54,10 +57,7 @@ module Data.ByteString.Base ( c_strlen, -- :: CString -> IO CInt c_malloc, -- :: CInt -> IO (Ptr Word8) c_free, -- :: Ptr Word8 -> IO () - -#if !defined(__GLASGOW_HASKELL__) c_free_finalizer, -- :: FunPtr (Ptr Word8 -> IO ()) -#endif memchr, -- :: Ptr Word8 -> Word8 -> CSize -> IO Ptr Word8 memcmp, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt @@ -74,7 +74,6 @@ module Data.ByteString.Base ( -- * Internal GHC magic #if defined(__GLASGOW_HASKELL__) - getProgArgv, -- :: Ptr CInt -> Ptr (Ptr CString) -> IO () memcpy_ptr_baoff, -- :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ()) #endif @@ -83,10 +82,10 @@ module Data.ByteString.Base ( ) where -import Foreign.ForeignPtr -import Foreign.Ptr +import Foreign.ForeignPtr (ForeignPtr, newForeignPtr_, withForeignPtr) +import Foreign.Ptr (Ptr, FunPtr, plusPtr, castPtr) import Foreign.Storable (Storable(..)) -import Foreign.C.Types +import Foreign.C.Types (CInt, CSize, CULong) import Foreign.C.String (CString, CStringLen) import Control.Exception (assert) @@ -95,21 +94,30 @@ import Data.Char (ord) import Data.Word (Word8) #if defined(__GLASGOW_HASKELL__) +import qualified Foreign.ForeignPtr as FC (finalizeForeignPtr) import qualified Foreign.Concurrent as FC (newForeignPtr) import Data.Generics (Data(..), Typeable(..)) import GHC.Prim (Addr#) import GHC.Ptr (Ptr(..)) import GHC.Base (realWorld#,unsafeChr) -import GHC.IOBase +import GHC.IOBase (IO(IO), unsafePerformIO, RawBuffer) +#else +import Data.Char (chr) +import System.IO.Unsafe (unsafePerformIO) +#endif -#if defined(__GLASGOW_HASKELL__) && !defined(SLOW_FOREIGN_PTR) +#if __GLASGOW_HASKELL__ >= 605 && !defined(SLOW_FOREIGN_PTR) import GHC.ForeignPtr (mallocPlainForeignPtrBytes) +#else +import Foreign.ForeignPtr (mallocForeignPtrBytes) #endif +#if __GLASGOW_HASKELL__>=605 +import GHC.ForeignPtr (ForeignPtr(ForeignPtr)) +import GHC.Base (nullAddr#) #else -import Data.Char (chr) -import System.IO.Unsafe (unsafePerformIO) +import Foreign.Ptr (nullPtr) #endif -- CFILES stuff is Hugs only @@ -141,6 +149,18 @@ data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8) deriving (Data, Typeable) #endif +-- | /O(1)/ The empty 'ByteString' +empty :: ByteString +empty = PS nullForeignPtr 0 0 + +nullForeignPtr :: ForeignPtr Word8 +#if __GLASGOW_HASKELL__>=605 +nullForeignPtr = ForeignPtr nullAddr# undefined --TODO: should ForeignPtrContents be strict? +#else +nullForeignPtr = unsafePerformIO $ newForeignPtr_ nullPtr +{-# NOINLINE nullForeignPtr #-} +#endif + -- --------------------------------------------------------------------- -- -- Extensions to the basic interface @@ -201,14 +221,10 @@ unsafeCreate :: Int -> (Ptr Word8 -> IO ()) -> ByteString unsafeCreate l f = unsafePerformIO (create l f) {-# INLINE unsafeCreate #-} --- | Wrapper of mallocForeignPtrBytes. +-- | Create ByteString of size @l@ and use action @f@ to fill it's contents. create :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString create l f = do -#if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__) - fp <- mallocForeignPtrBytes l -#else - fp <- mallocPlainForeignPtrBytes l -#endif + fp <- mallocByteString l withForeignPtr fp $ \p -> f p return $! PS fp 0 l @@ -222,11 +238,7 @@ create l f = do -- createAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString createAndTrim l f = do -#if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__) - fp <- mallocForeignPtrBytes l -#else - fp <- mallocPlainForeignPtrBytes l -#endif + fp <- mallocByteString l withForeignPtr fp $ \p -> do l' <- f p if assert (l' <= l) $ l' >= l @@ -235,11 +247,7 @@ createAndTrim l f = do createAndTrim' :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a) createAndTrim' l f = do -#if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__) - fp <- mallocForeignPtrBytes l -#else - fp <- mallocPlainForeignPtrBytes l -#endif + fp <- mallocByteString l withForeignPtr fp $ \p -> do (off, l', res) <- f p if assert (l' <= l) $ l' >= l @@ -248,6 +256,16 @@ createAndTrim' l f = do memcpy p' (p `plusPtr` off) (fromIntegral l') return $! (ps, res) +-- | Wrapper of mallocForeignPtrBytes with faster implementation +-- for GHC 6.5 builds newer than 06/06/06 +mallocByteString :: Int -> IO (ForeignPtr a) +mallocByteString l = do +#if __GLASGOW_HASKELL__ >= 605 && !defined(SLOW_FOREIGN_PTR) + mallocPlainForeignPtrBytes l +#else + mallocForeignPtrBytes l +#endif + #if defined(__GLASGOW_HASKELL__) -- | /O(n)/ Pack a null-terminated sequence of bytes, pointed to by an -- Addr\# (an arbitrary machine address assumed to point outside the @@ -300,7 +318,7 @@ packCStringFinalizer p l f = do -- this, you need to have a proof of some kind that all 'ByteString's -- ever generated from the underlying byte array are no longer live. unsafeFinalize :: ByteString -> IO () -unsafeFinalize (PS p _ _) = finalizeForeignPtr p +unsafeFinalize (PS p _ _) = FC.finalizeForeignPtr p #endif @@ -373,12 +391,9 @@ unsafeUseAsCString :: ByteString -> (CString -> IO a) -> IO a unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s) -- | /O(1) construction/ Use a @ByteString@ with a function requiring a --- @CStringLen@. Warning: modifying the @CStringLen@ will affect the --- @ByteString@. This is analogous to unsafeUseAsCString, and comes --- with the same safety requirements. The user must ensure there is a --- null byte at the end of the string. +-- @CStringLen@. unsafeUseAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a -unsafeUseAsCStringLen (PS ps s l) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s,l) +unsafeUseAsCStringLen (PS ps s l) f = withForeignPtr ps $ \p -> f (castPtr p `plusPtr` s,l) -- --------------------------------------------------------------------- -- @@ -386,18 +401,16 @@ unsafeUseAsCStringLen (PS ps s l) ac = withForeignPtr ps $ \p -> ac (castPtr p ` -- foreign import ccall unsafe "string.h strlen" c_strlen - :: CString -> IO CInt + :: CString -> IO CSize foreign import ccall unsafe "stdlib.h malloc" c_malloc - :: CInt -> IO (Ptr Word8) + :: CSize -> IO (Ptr Word8) foreign import ccall unsafe "static stdlib.h free" c_free :: Ptr Word8 -> IO () -#if !defined(__GLASGOW_HASKELL__) foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer :: FunPtr (Ptr Word8 -> IO ()) -#endif foreign import ccall unsafe "string.h memchr" memchr :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8) @@ -421,19 +434,19 @@ foreign import ccall unsafe "string.h memset" memset -- foreign import ccall unsafe "static fpstring.h fps_reverse" c_reverse - :: Ptr Word8 -> Ptr Word8 -> CInt -> IO () + :: Ptr Word8 -> Ptr Word8 -> CULong -> IO () foreign import ccall unsafe "static fpstring.h fps_intersperse" c_intersperse - :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO () + :: Ptr Word8 -> Ptr Word8 -> CULong -> Word8 -> IO () foreign import ccall unsafe "static fpstring.h fps_maximum" c_maximum - :: Ptr Word8 -> CInt -> IO Word8 + :: Ptr Word8 -> CULong -> IO Word8 foreign import ccall unsafe "static fpstring.h fps_minimum" c_minimum - :: Ptr Word8 -> CInt -> IO Word8 + :: Ptr Word8 -> CULong -> IO Word8 foreign import ccall unsafe "static fpstring.h fps_count" c_count - :: Ptr Word8 -> CInt -> Word8 -> IO CInt + :: Ptr Word8 -> CULong -> Word8 -> IO CULong -- --------------------------------------------------------------------- -- MMap @@ -455,9 +468,6 @@ foreign import ccall unsafe "static sys/mman.h munmap" c_munmap -- Internal GHC Haskell magic #if defined(__GLASGOW_HASKELL__) -foreign import ccall unsafe "RtsAPI.h getProgArgv" - getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO () - foreign import ccall unsafe "__hscore_memcpy_src_off" memcpy_ptr_baoff :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ()) #endif diff --git a/Data/ByteString/Char8.hs b/Data/ByteString/Char8.hs index 71bf394..93f6dc5 100644 --- a/Data/ByteString/Char8.hs +++ b/Data/ByteString/Char8.hs @@ -65,8 +65,11 @@ module Data.ByteString.Char8 ( foldl', -- :: (a -> Char -> a) -> a -> ByteString -> a foldl1, -- :: (Char -> Char -> Char) -> ByteString -> Char foldl1', -- :: (Char -> Char -> Char) -> ByteString -> Char + foldr, -- :: (Char -> a -> a) -> a -> ByteString -> a + foldr', -- :: (Char -> a -> a) -> a -> ByteString -> a foldr1, -- :: (Char -> Char -> Char) -> ByteString -> Char + foldr1', -- :: (Char -> Char -> Char) -> ByteString -> Char -- ** Special folds concat, -- :: [ByteString] -> ByteString @@ -80,12 +83,12 @@ module Data.ByteString.Char8 ( -- ** Scans scanl, -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString scanl1, -- :: (Char -> Char -> Char) -> ByteString -> ByteString --- scanr, -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString --- scanr1, -- :: (Char -> Char -> Char) -> ByteString -> ByteString + scanr, -- :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString + scanr1, -- :: (Char -> Char -> Char) -> ByteString -> ByteString -- ** Accumulating maps --- mapAccumL, -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString) --- mapAccumR, -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString) + mapAccumL, -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString) + mapAccumR, -- :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString) mapIndexed, -- :: (Int -> Char -> Char) -> ByteString -> ByteString -- * Generating and unfolding ByteStrings @@ -110,17 +113,9 @@ module Data.ByteString.Char8 ( inits, -- :: ByteString -> [ByteString] tails, -- :: ByteString -> [ByteString] - -- ** Breaking and dropping on specific Chars - breakChar, -- :: Char -> ByteString -> (ByteString, ByteString) - spanChar, -- :: Char -> ByteString -> (ByteString, ByteString) - breakSpace, -- :: ByteString -> (ByteString,ByteString) - dropSpace, -- :: ByteString -> ByteString - dropSpaceEnd, -- :: ByteString -> ByteString - -- ** Breaking into many substrings split, -- :: Char -> ByteString -> [ByteString] splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString] - tokens, -- :: (Char -> Bool) -> ByteString -> [ByteString] -- ** Breaking into lines and words lines, -- :: ByteString -> [ByteString] @@ -128,20 +123,8 @@ module Data.ByteString.Char8 ( unlines, -- :: [ByteString] -> ByteString unwords, -- :: ByteString -> [ByteString] - lines', -- :: ByteString -> [ByteString] - unlines', -- :: [ByteString] -> ByteString - linesCRLF', -- :: ByteString -> [ByteString] - unlinesCRLF', -- :: [ByteString] -> ByteString - words', -- :: ByteString -> [ByteString] - unwords', -- :: ByteString -> [ByteString] - - lineIndices, -- :: ByteString -> [Int] - betweenLines, -- :: ByteString -> ByteString -> ByteString -> Maybe (ByteString) - -- ** Joining strings join, -- :: ByteString -> [ByteString] -> ByteString - joinWithChar, -- :: Char -> ByteString -> ByteString -> ByteString - -- ** Searching for substrings isPrefixOf, -- :: ByteString -> ByteString -> Bool @@ -155,8 +138,6 @@ module Data.ByteString.Char8 ( -- ** Searching by equality elem, -- :: Char -> ByteString -> Bool notElem, -- :: Char -> ByteString -> Bool - filterChar, -- :: Char -> ByteString -> ByteString - filterNotChar, -- :: Char -> ByteString -> ByteString -- ** Searching with a predicate find, -- :: (Char -> Bool) -> ByteString -> Maybe Char @@ -180,10 +161,6 @@ module Data.ByteString.Char8 ( -- * Ordered ByteStrings sort, -- :: ByteString -> ByteString - -- * Conversion - w2c, -- :: Word8 -> Char - c2w, -- :: Char -> Word8 - -- * Reading from ByteStrings readInt, -- :: ByteString -> Maybe Int @@ -206,13 +183,11 @@ module Data.ByteString.Char8 ( -- * I\/O with @ByteString@s -- ** Standard input and output - -#if defined(__GLASGOW_HASKELL__) getLine, -- :: IO ByteString -#endif getContents, -- :: IO ByteString putStr, -- :: ByteString -> IO () putStrLn, -- :: ByteString -> IO () + interact, -- :: (ByteString -> ByteString) -> IO () -- ** Files readFile, -- :: FilePath -> IO ByteString @@ -221,12 +196,8 @@ module Data.ByteString.Char8 ( -- mmapFile, -- :: FilePath -> IO ByteString -- ** I\/O with Handles -#if defined(__GLASGOW_HASKELL__) - getArgs, -- :: IO [ByteString] hGetLine, -- :: Handle -> IO ByteString - hGetLines, -- :: Handle -> IO ByteString hGetNonBlocking, -- :: Handle -> Int -> IO ByteString -#endif hGetContents, -- :: Handle -> IO ByteString hGet, -- :: Handle -> Int -> IO ByteString hPut, -- :: Handle -> ByteString -> IO () @@ -244,7 +215,6 @@ module Data.ByteString.Char8 ( #if defined(__GLASGOW_HASKELL__) unpackList, #endif - filter', map' ) where @@ -253,9 +223,11 @@ import Prelude hiding (reverse,head,tail,last,init,null ,length,map,lines,foldl,foldr,unlines ,concat,any,take,drop,splitAt,takeWhile ,dropWhile,span,break,elem,filter,unwords - ,words,maximum,minimum,all,concatMap,scanl,scanl1 - ,foldl1,foldr1,readFile,writeFile,appendFile,replicate - ,getContents,getLine,putStr,putStrLn + ,words,maximum,minimum,all,concatMap + ,scanl,scanl1,scanr,scanr1 + ,appendFile,readFile,writeFile + ,foldl1,foldr1,replicate + ,getContents,getLine,putStr,putStrLn,interact ,zip,zipWith,unzip,notElem) import qualified Data.ByteString as B @@ -268,13 +240,12 @@ import Data.ByteString (empty,null,length,tail,init,append ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring ,findSubstrings,copy,group - ,getContents, putStr, putStrLn - ,readFile, {-mmapFile,-} writeFile, appendFile + ,getLine, getContents, putStr, putStrLn, interact ,hGetContents, hGet, hPut, hPutStr, hPutStrLn + ,hGetLine, hGetNonBlocking ,packCString,packCStringLen, packMallocCString ,useAsCString,useAsCStringLen, copyCString,copyCStringLen #if defined(__GLASGOW_HASKELL__) - ,getLine, getArgs, hGetLine, hGetLines, hGetNonBlocking ,unpackList #endif ) @@ -284,11 +255,14 @@ import Data.ByteString.Base ( #if defined(__GLASGOW_HASKELL__) ,packAddress, unsafePackAddress #endif - ,c2w, w2c, unsafeTail, inlinePerformIO, isSpaceWord8 + ,c2w, w2c, unsafeTail, isSpaceWord8, inlinePerformIO ) +import Data.Char ( isSpace ) import qualified Data.List as List (intersperse) +import System.IO (openFile,hClose,hFileSize,IOMode(..)) +import Control.Exception (bracket) import Foreign #if defined(__GLASGOW_HASKELL__) @@ -345,7 +319,7 @@ pack str = B.unsafeCreate (P.length str) $ \(Ptr p) -> stToIO (go p str) -- | /O(n)/ Converts a 'ByteString' to a 'String'. unpack :: ByteString -> [Char] -unpack = B.unpackWith w2c +unpack = P.map w2c . B.unpack {-# INLINE unpack #-} -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different @@ -401,6 +375,11 @@ foldr :: (Char -> a -> a) -> a -> ByteString -> a foldr f = B.foldr (\c a -> f (w2c c) a) {-# INLINE foldr #-} +-- | 'foldr\'' is a strict variant of foldr +foldr' :: (Char -> a -> a) -> a -> ByteString -> a +foldr' f = B.foldr' (\c a -> f (w2c c) a) +{-# INLINE foldr' #-} + -- | 'foldl1' is a variant of 'foldl' that has no starting value -- argument, and thus must be applied to non-empty 'ByteStrings'. foldl1 :: (Char -> Char -> Char) -> ByteString -> Char @@ -418,6 +397,11 @@ foldr1 :: (Char -> Char -> Char) -> ByteString -> Char foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps) {-# INLINE foldr1 #-} +-- | A strict variant of foldr1 +foldr1' :: (Char -> Char -> Char) -> ByteString -> Char +foldr1' f ps = w2c (B.foldr1' (\x y -> c2w (f (w2c x) (w2c y))) ps) +{-# INLINE foldr1' #-} + -- | Map a function over a 'ByteString' and concatenate the results concatMap :: (Char -> ByteString) -> ByteString -> ByteString concatMap f = B.concatMap (f . w2c) @@ -450,6 +434,20 @@ mapIndexed :: (Int -> Char -> Char) -> ByteString -> ByteString mapIndexed f = B.mapIndexed (\i c -> c2w (f i (w2c c))) {-# INLINE mapIndexed #-} +-- | The 'mapAccumL' function behaves like a combination of 'map' and +-- 'foldl'; it applies a function to each element of a ByteString, +-- passing an accumulating parameter from left to right, and returning a +-- final value of this accumulator together with the new list. +mapAccumL :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString) +mapAccumL f = B.mapAccumL (\acc w -> case f acc (w2c w) of (acc', c) -> (acc', c2w c)) + +-- | The 'mapAccumR' function behaves like a combination of 'map' and +-- 'foldr'; it applies a function to each element of a ByteString, +-- passing an accumulating parameter from right to left, and returning a +-- final value of this accumulator together with the new ByteString. +mapAccumR :: (acc -> Char -> (acc, Char)) -> acc -> ByteString -> (acc, ByteString) +mapAccumR f = B.mapAccumR (\acc w -> case f acc (w2c w) of (acc', c) -> (acc', c2w c)) + -- | 'scanl' is similar to 'foldl', but returns a list of successive -- reduced values from the left: -- @@ -467,6 +465,14 @@ scanl f z = B.scanl (\a b -> c2w (f (w2c a) (w2c b))) (c2w z) scanl1 :: (Char -> Char -> Char) -> ByteString -> ByteString scanl1 f = B.scanl1 (\a b -> c2w (f (w2c a) (w2c b))) +-- | scanr is the right-to-left dual of scanl. +scanr :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString +scanr f z = B.scanr (\a b -> c2w (f (w2c a) (w2c b))) (c2w z) + +-- | 'scanr1' is a variant of 'scanr' that has no starting value argument. +scanr1 :: (Char -> Char -> Char) -> ByteString -> ByteString +scanr1 f = B.scanr1 (\a b -> c2w (f (w2c a) (w2c b))) + -- | /O(n)/ 'replicate' @n x@ is a ByteString of length @n@ with @x@ -- the value of every element. The following holds: -- @@ -549,6 +555,7 @@ breakEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString) breakEnd f = B.breakEnd (f . w2c) {-# INLINE breakEnd #-} +{- -- | 'breakChar' breaks its ByteString argument at the first occurence -- of the specified Char. It is more efficient than 'break' as it is -- implemented with @memchr(3)@. I.e. @@ -568,6 +575,7 @@ breakChar = B.breakByte . c2w spanChar :: Char -> ByteString -> (ByteString, ByteString) spanChar = B.spanByte . c2w {-# INLINE spanChar #-} +-} -- | /O(n)/ Break a 'ByteString' into pieces separated by the byte -- argument, consuming the delimiter. I.e. @@ -601,6 +609,7 @@ splitWith f = B.splitWith (f . w2c) {-# INLINE splitWith #-} -- the inline makes a big difference here. +{- -- | Like 'splitWith', except that sequences of adjacent separators are -- treated as a single separator. eg. -- @@ -609,17 +618,20 @@ splitWith f = B.splitWith (f . w2c) tokens :: (Char -> Bool) -> ByteString -> [ByteString] tokens f = B.tokens (f . w2c) {-# INLINE tokens #-} +-} -- | The 'groupBy' function is the non-overloaded version of 'group'. groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString] groupBy k = B.groupBy (\a b -> k (w2c a) (w2c b)) +{- -- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a -- char. Around 4 times faster than the generalised join. -- joinWithChar :: Char -> ByteString -> ByteString -> ByteString joinWithChar = B.joinWithByte . c2w {-# INLINE joinWithChar #-} +-} -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0. index :: ByteString -> Int -> Char @@ -699,6 +711,7 @@ find :: (Char -> Bool) -> ByteString -> Maybe Char find f ps = w2c `fmap` B.find (f . w2c) ps {-# INLINE find #-} +{- -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common -- case of filtering a single Char. It is more efficient to use -- filterChar in this case. @@ -724,6 +737,7 @@ filterChar c = B.filterByte (c2w c) filterNotChar :: Char -> ByteString -> ByteString filterNotChar c = B.filterNotByte (c2w c) {-# INLINE filterNotChar #-} +-} -- | /O(n)/ 'zip' takes two ByteStrings and returns a list of -- corresponding pairs of Chars. If one input ByteString is short, @@ -756,17 +770,14 @@ unsafeHead :: ByteString -> Char unsafeHead = w2c . B.unsafeHead {-# INLINE unsafeHead #-} --- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a Char. --- This omits the bounds check, which means there is an accompanying --- obligation on the programmer to ensure the bounds are checked in some --- other way. -unsafeIndex :: ByteString -> Int -> Char -unsafeIndex = (w2c .) . B.unsafeIndex -{-# INLINE unsafeIndex #-} - -- --------------------------------------------------------------------- -- Things that depend on the encoding +{-# RULES + "FPS specialise break -> breakSpace" + break isSpace = breakSpace + #-} + -- | 'breakSpace' returns the pair of ByteStrings when the argument is -- broken at the first whitespace byte. I.e. -- @@ -789,6 +800,11 @@ firstspace ptr n m | otherwise = do w <- peekByteOff ptr n if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n +{-# RULES + "FPS specialise dropWhile isSpace -> dropSpace" + dropWhile isSpace = dropSpace + #-} + -- | 'dropSpace' efficiently returns the 'ByteString' argument with -- white space Chars removed from the front. It is more efficient than -- calling dropWhile for removing whitespace. I.e. @@ -808,6 +824,7 @@ firstnonspace ptr n m | otherwise = do w <- peekElemOff ptr n if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n +{- -- | 'dropSpaceEnd' efficiently returns the 'ByteString' argument with -- white space removed from the end. I.e. -- @@ -827,6 +844,7 @@ lastnonspace ptr n | n < 0 = return n | otherwise = do w <- peekElemOff ptr n if isSpaceWord8 w then lastnonspace ptr (n-1) else return n +-} -- | 'lines' breaks a ByteString up into a list of ByteStrings at -- newline Chars. The resulting strings do not contain newlines. @@ -840,13 +858,6 @@ lines ps where search = elemIndex '\n' {-# INLINE lines #-} -{-# Bogus rule, wrong if there's not \n at end of line - -"length.lines/count" - P.length . lines = count '\n' - - #-} - {- -- Just as fast, but more complex. Should be much faster, I thought. lines :: ByteString -> [ByteString] @@ -878,7 +889,7 @@ unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space -- > tokens isSpace = words -- words :: ByteString -> [ByteString] -words = B.tokens isSpaceWord8 +words = P.filter (not . B.null) . B.splitWith isSpaceWord8 {-# INLINE words #-} -- | The 'unwords' function is analogous to the 'unlines' function, on words. @@ -886,95 +897,6 @@ unwords :: [ByteString] -> ByteString unwords = join (singleton ' ') {-# INLINE unwords #-} --- | /O(n)/ Indicies of newlines. Shorthand for --- --- > elemIndices '\n' --- -lineIndices :: ByteString -> [Int] -lineIndices = elemIndices '\n' -{-# INLINE lineIndices #-} - --- | 'lines\'' behaves like 'lines', in that it breaks a ByteString on --- newline Chars. However, unlike the Prelude functions, 'lines\'' and --- 'unlines\'' correctly reconstruct lines that are missing terminating --- newlines characters. I.e. --- --- > unlines (lines "a\nb\nc") == "a\nb\nc\n" --- > unlines' (lines' "a\nb\nc") == "a\nb\nc" --- --- Note that this means: --- --- > lines "a\nb\nc\n" == ["a","b","c"] --- > lines' "a\nb\nc\n" == ["a","b","c",""] --- -lines' :: ByteString -> [ByteString] -lines' ps = ps `seq` case elemIndex '\n' ps of - Nothing -> [ps] - Just n -> take n ps : lines' (drop (n+1) ps) - --- | 'linesCRLF\'' behaves like 'lines\'', but breaks on (\\cr?\\lf) -linesCRLF' :: ByteString -> [ByteString] -linesCRLF' ps = ps `seq` case elemIndex '\n' ps of - Nothing -> [ps] - Just 0 -> empty : linesCRLF' (drop 1 ps) - Just n -> let k = if ps `unsafeIndex` (n-1) == '\r' then n-1 else n - in take k ps : linesCRLF' (drop (n+1) ps) - --- | 'unlines\'' behaves like 'unlines', except that it also correctly --- retores lines that do not have terminating newlines (see the --- description for 'lines\''). --- -unlines' :: [ByteString] -> ByteString -unlines' ss = concat $ intersperse_newlines ss - where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s) - intersperse_newlines s = s - newline = singleton '\n' - --- | 'unlines\'' behaves like 'unlines', except that it also correctly --- retores lines that do not have terminating newlines (see the --- description for 'lines\''). Uses CRLF instead of LF. --- -unlinesCRLF' :: [ByteString] -> ByteString -unlinesCRLF' ss = concat $ intersperse_newlines ss - where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s) - intersperse_newlines s = s - newline = pack "\r\n" - --- | 'words\'' behaves like 'words', with the exception that it produces --- output on ByteStrings with trailing whitespace that can be --- correctly inverted by 'unwords'. I.e. --- --- > words "a b c " == ["a","b","c"] --- > words' "a b c " == ["a","b","c",""] --- --- > unwords $ words "a b c " == "a b c" --- > unwords $ words' "a b c " == "a b c " --- -words' :: ByteString -> [ByteString] -words' = B.splitWith isSpaceWord8 - --- | 'unwords\'' behaves like 'unwords'. It is provided for consistency --- with the other invertable words and lines functions. -unwords' :: [ByteString] -> ByteString -unwords' = unwords - --- | 'betweenLines' returns the ByteString between the two lines given, --- or Nothing if they do not appear. The returned string is the first --- and shortest string such that the line before it is the given first --- line, and the line after it is the given second line. -betweenLines :: ByteString -- ^ First line to look for - -> ByteString -- ^ Second line to look for - -> ByteString -- ^ 'ByteString' to look in - -> Maybe (ByteString) - -betweenLines start end ps = - case P.break (start ==) (lines ps) of - (_, _:rest@(PS ps1 s1 _:_)) -> - case P.break (end ==) rest of - (_, PS _ s2 _:_) -> Just $ PS ps1 s1 (s2 - s1) - _ -> Nothing - _ -> Nothing - -- --------------------------------------------------------------------- -- Reading from ByteStrings @@ -1006,12 +928,21 @@ readInt as end True _ n ps = Just (negate n, ps) end _ _ n ps = Just (n, ps) --- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is --- slightly faster for one-shot cases. -map' :: (Char -> Char) -> ByteString -> ByteString -map' f = B.map' (c2w . f . w2c) +-- | Read an entire file strictly into a 'ByteString'. This is far more +-- efficient than reading the characters into a 'String' and then using +-- 'pack'. It also may be more efficient than opening the file and +-- reading it using hGet. +readFile :: FilePath -> IO ByteString +readFile f = bracket (openFile f ReadMode) hClose + (\h -> hFileSize h >>= hGet h . fromIntegral) + +-- | Write a 'ByteString' to a file. +writeFile :: FilePath -> ByteString -> IO () +writeFile f txt = bracket (openFile f WriteMode) hClose + (\h -> hPut h txt) + +-- | Append a 'ByteString' to a file. +appendFile :: FilePath -> ByteString -> IO () +appendFile f txt = bracket (openFile f AppendMode) hClose + (\h -> hPut h txt) --- | /O(n)/ 'filter\'' is a non-fuseable version of filter, that may be --- around 2x faster for some one-shot applications. -filter' :: (Char -> Bool) -> ByteString -> ByteString -filter' f = B.filter' (f . w2c) diff --git a/Data/ByteString/Lazy.hs b/Data/ByteString/Lazy.hs index 17e181f..eb4ba61 100644 --- a/Data/ByteString/Lazy.hs +++ b/Data/ByteString/Lazy.hs @@ -1,6 +1,4 @@ -{-# OPTIONS_GHC -cpp -optc-O1 -fffi -fglasgow-exts -fno-warn-incomplete-patterns #-} --- --- -optc-O2 breaks with 4.0.4 gcc on debian +{-# OPTIONS_GHC -cpp -fglasgow-exts -fno-warn-orphans -fno-warn-incomplete-patterns #-} -- -- Module : ByteString.Lazy -- Copyright : (c) Don Stewart 2006 @@ -52,8 +50,6 @@ module Data.ByteString.Lazy ( singleton, -- :: Word8 -> ByteString pack, -- :: [Word8] -> ByteString unpack, -- :: ByteString -> [Word8] - packWith, -- :: (a -> Word8) -> [a] -> ByteString - unpackWith, -- :: (Word8 -> a) -> ByteString -> [a] -- * Basic interface cons, -- :: Word8 -> ByteString -> ByteString @@ -97,6 +93,7 @@ module Data.ByteString.Lazy ( -- ** Accumulating maps mapAccumL, -- :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString) + mapAccumR, -- :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString) mapIndexed, -- :: (Int64 -> Word8 -> Word8) -> ByteString -> ByteString -- ** Infinite ByteStrings @@ -123,10 +120,6 @@ module Data.ByteString.Lazy ( inits, -- :: ByteString -> [ByteString] tails, -- :: ByteString -> [ByteString] - -- ** Breaking and dropping on specific bytes - breakByte, -- :: Word8 -> ByteString -> (ByteString, ByteString) - spanByte, -- :: Word8 -> ByteString -> (ByteString, ByteString) - -- ** Breaking into many substrings split, -- :: Word8 -> ByteString -> [ByteString] splitWith, -- :: (Word8 -> Bool) -> ByteString -> [ByteString] @@ -134,7 +127,6 @@ module Data.ByteString.Lazy ( -- ** Joining strings join, -- :: ByteString -> [ByteString] -> ByteString - joinWithByte, -- :: Word8 -> ByteString -> ByteString -> ByteString -- * Predicates isPrefixOf, -- :: ByteString -> ByteString -> Bool @@ -145,8 +137,6 @@ module Data.ByteString.Lazy ( -- ** Searching by equality elem, -- :: Word8 -> ByteString -> Bool notElem, -- :: Word8 -> ByteString -> Bool - filterByte, -- :: Word8 -> ByteString -> ByteString - filterNotByte, -- :: Word8 -> ByteString -> ByteString -- ** Searching with a predicate find, -- :: (Word8 -> Bool) -> ByteString -> Maybe Word8 @@ -169,6 +159,8 @@ module Data.ByteString.Lazy ( -- * Ordered ByteStrings -- sort, -- :: ByteString -> ByteString + copy, -- :: ByteString -> ByteString + -- * I\/O with 'ByteString's -- ** Standard input and output @@ -184,14 +176,12 @@ module Data.ByteString.Lazy ( -- ** I\/O with Handles hGetContents, -- :: Handle -> IO ByteString - hGetContentsN, -- :: Int -> Handle -> IO ByteString hGet, -- :: Handle -> Int -> IO ByteString - hGetN, -- :: Int -> Handle -> Int -> IO ByteString hPut, -- :: Handle -> ByteString -> IO () -#if defined(__GLASGOW_HASKELL__) hGetNonBlocking, -- :: Handle -> IO ByteString - hGetNonBlockingN, -- :: Int -> Handle -> IO ByteString -#endif +-- hGetN, -- :: Int -> Handle -> Int -> IO ByteString +-- hGetContentsN, -- :: Int -> Handle -> IO ByteString +-- hGetNonBlockingN, -- :: Int -> Handle -> IO ByteString ) where @@ -213,10 +203,15 @@ import Data.Monoid (Monoid(..)) import Data.Word (Word8) import Data.Int (Int64) -import System.IO (Handle,stdin,stdout,openBinaryFile,IOMode(..),hClose) +import System.IO (Handle,stdin,stdout,openBinaryFile,IOMode(..) + ,hClose,hWaitForInput,hIsEOF) import System.IO.Unsafe import Control.Exception (bracket) +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Ptr +import Foreign.Storable + #if defined(__GLASGOW_HASKELL__) import Data.Generics (Data(..), Typeable(..)) #endif @@ -304,12 +299,14 @@ _abstr (LPS xs) = P.concat xs -- and need to share the cache with other programs. -- defaultChunkSize :: Int -defaultChunkSize = 64 * k +defaultChunkSize = 32 * k - overhead where k = 1024 + overhead = 2 * sizeOf (undefined :: Int) smallChunkSize :: Int -smallChunkSize = 4 * k +smallChunkSize = 4 * k - overhead where k = 1024 + overhead = 2 * sizeOf (undefined :: Int) -- defaultChunkSize = 1 @@ -372,6 +369,7 @@ unpack (LPS ss) = L.concatMap P.unpack ss ------------------------------------------------------------------------ +{- -- | /O(n)/ Convert a '[a]' into a 'ByteString' using some -- conversion function packWith :: (a -> Word8) -> [a] -> ByteString @@ -384,6 +382,7 @@ unpackWith :: (Word8 -> a) -> ByteString -> [a] unpackWith k (LPS ss) = L.concatMap (P.unpackWith k) ss {-# INLINE unpackWith #-} {-# SPECIALIZE unpackWith :: (Word8 -> Char) -> ByteString -> [Char] #-} +-} -- --------------------------------------------------------------------- -- Basic interface @@ -391,12 +390,12 @@ unpackWith k (LPS ss) = L.concatMap (P.unpackWith k) ss -- | /O(1)/ Test whether a ByteString is empty. null :: ByteString -> Bool null (LPS []) = True -null (_) = False -- TODO: guarantee this invariant is maintained +null (_) = False {-# INLINE null #-} -- | /O(n\/c)/ 'length' returns the length of a ByteString as an 'Int64' length :: ByteString -> Int64 -length (LPS ss) = L.sum (L.map (fromIntegral.P.length) ss) +length (LPS ss) = L.foldl' (\n ps -> n + fromIntegral (P.length ps)) 0 ss -- avoid the intermediate list? -- length (LPS ss) = L.foldl lengthF 0 ss @@ -445,7 +444,7 @@ last (LPS []) = errorEmptyList "last" last (LPS xs) = P.last (L.last xs) {-# INLINE last #-} --- | /O(1)/ Return all the elements of a 'ByteString' except the last one. +-- | /O(n\/c)/ Return all the elements of a 'ByteString' except the last one. init :: ByteString -> ByteString init (LPS []) = errorEmptyList "init" init (LPS xs) @@ -473,7 +472,13 @@ map f = LPS . P.loopArr . loopL (P.mapEFL f) P.NoAcc . unLPS -- | /O(n)/ 'reverse' @xs@ efficiently returns the elements of @xs@ in reverse order. reverse :: ByteString -> ByteString -reverse (LPS xs) = LPS (L.reverse . L.map P.reverse $ xs) +reverse (LPS ps) = LPS (rev [] ps) + where rev a [] = a + rev a (x:xs) = rev (P.reverse x:a) xs +-- note, here is one example where the extra element lazyness is an advantage. +-- we can reerse the list of chunks strictly but reverse each chunk lazily +-- so while we may force the whole lot into memory we do not need to copy +-- each chunk until it is used. {-# INLINE reverse #-} -- The 'intersperse' function takes a 'Word8' and a 'ByteString' and @@ -567,14 +572,14 @@ all f (LPS xs) = L.and (L.map (P.all f) xs) -- | /O(n)/ 'maximum' returns the maximum value from a 'ByteString' maximum :: ByteString -> Word8 -maximum (LPS []) = errorEmptyList "maximum" -maximum (LPS xs) = L.maximum (L.map P.maximum xs) +maximum (LPS []) = errorEmptyList "maximum" +maximum (LPS (x:xs)) = L.foldl' (\n ps -> n `max` P.maximum ps) (P.maximum x) xs {-# INLINE maximum #-} -- | /O(n)/ 'minimum' returns the minimum value from a 'ByteString' minimum :: ByteString -> Word8 -minimum (LPS []) = errorEmptyList "minimum" -minimum (LPS xs) = L.minimum (L.map P.minimum xs) +minimum (LPS []) = errorEmptyList "minimum" +minimum (LPS (x:xs)) = L.foldl' (\n ps -> n `min` P.minimum ps) (P.minimum x) xs {-# INLINE minimum #-} -- | The 'mapAccumL' function behaves like a combination of 'map' and @@ -584,6 +589,9 @@ minimum (LPS xs) = L.minimum (L.map P.minimum xs) mapAccumL :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString) mapAccumL f z = (\(a :*: ps) -> (a, LPS ps)) . loopL (P.mapAccumEFL f) z . unLPS +mapAccumR :: (acc -> Word8 -> (acc, Word8)) -> acc -> ByteString -> (acc, ByteString) +mapAccumR = error "mapAccumR unimplemented" + -- | /O(n)/ map Word8 functions, provided with the index at each position mapIndexed :: (Int -> Word8 -> Word8) -> ByteString -> ByteString mapIndexed f = LPS . P.loopArr . loopL (P.mapIndexEFL f) 0 . unLPS @@ -704,7 +712,7 @@ takeWhile :: (Word8 -> Bool) -> ByteString -> ByteString takeWhile f (LPS ps) = LPS (takeWhile' ps) where takeWhile' [] = [] takeWhile' (x:xs) = - case P.findIndexOrEnd (not . f) x of + case findIndexOrEnd (not . f) x of 0 -> [] n | n < P.length x -> P.take n x : [] | otherwise -> x : takeWhile' xs @@ -714,7 +722,7 @@ dropWhile :: (Word8 -> Bool) -> ByteString -> ByteString dropWhile f (LPS ps) = LPS (dropWhile' ps) where dropWhile' [] = [] dropWhile' (x:xs) = - case P.findIndexOrEnd (not . f) x of + case findIndexOrEnd (not . f) x of n | n < P.length x -> P.drop n x : xs | otherwise -> dropWhile' xs @@ -723,12 +731,19 @@ break :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString) break f (LPS ps) = case (break' ps) of (a,b) -> (LPS a, LPS b) where break' [] = ([], []) break' (x:xs) = - case P.findIndexOrEnd f x of + case findIndexOrEnd f x of 0 -> ([], x : xs) n | n < P.length x -> (P.take n x : [], P.drop n x : xs) | otherwise -> let (xs', xs'') = break' xs in (x : xs', xs'') +-- +-- TODO +-- +-- Add rules +-- + +{- -- | 'breakByte' breaks its ByteString argument at the first occurence -- of the specified byte. It is more efficient than 'break' as it is -- implemented with @memchr(3)@. I.e. @@ -760,6 +775,7 @@ spanByte c (LPS ps) = case (spanByte' ps) of (a,b) -> (LPS a, LPS b) | P.null x'' -> let (xs', xs'') = spanByte' xs in (x : xs', xs'') | otherwise -> (x' : [], x'' : xs) +-} -- | 'span' @p xs@ breaks the ByteString into two segments. It is -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@ @@ -860,6 +876,8 @@ group xs -- | The 'groupBy' function is the non-overloaded version of 'group'. -- groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString] +groupBy = error "Data.ByteString.Lazy.groupBy: unimplemented" +{- groupBy _ (LPS []) = [] groupBy k (LPS (a:as)) = groupBy' [] 0 (P.groupBy k a) as where groupBy' :: [P.ByteString] -> Word8 -> [P.ByteString] -> [P.ByteString] -> [ByteString] @@ -869,6 +887,7 @@ groupBy k (LPS (a:as)) = groupBy' [] 0 (P.groupBy k a) as groupBy' [] _ (s:[]) (x:xs) = groupBy' (s:[]) (P.unsafeHead s) (P.groupBy k x) xs groupBy' acc c (s:[]) (x:xs) = groupBy' (s:acc) c (P.groupBy k x) xs groupBy' acc _ (s:ss) xs = LPS (L.reverse (s : acc)) : groupBy' [] 0 ss xs +-} {- TODO: check if something like this might be faster @@ -887,12 +906,6 @@ groupBy k xs join :: ByteString -> [ByteString] -> ByteString join s = concat . (L.intersperse s) --- | /O(n)/ joinWithByte. An efficient way to join to two ByteStrings --- with a char. --- -joinWithByte :: Word8 -> ByteString -> ByteString -> ByteString -joinWithByte c x y = append x (cons c y) - -- --------------------------------------------------------------------- -- Indexing ByteStrings @@ -953,7 +966,7 @@ elemIndices c (LPS ps) = elemIndices' 0 ps -- -- But more efficiently than using length on the intermediate list. count :: Word8 -> ByteString -> Int64 -count w (LPS xs) = L.sum (L.map (fromIntegral . P.count w) xs) +count w (LPS xs) = L.foldl' (\n ps -> n + fromIntegral (P.count w ps)) 0 xs -- | The 'findIndex' function takes a predicate and a 'ByteString' and -- returns the index of the first element in the ByteString @@ -1008,6 +1021,7 @@ filter :: (Word8 -> Bool) -> ByteString -> ByteString filter p = LPS . P.loopArr . loopL (P.filterEFL p) P.NoAcc . unLPS {-# INLINE filter #-} +{- -- | /O(n)/ and /O(n\/c) space/ A first order equivalent of /filter . -- (==)/, for the common case of filtering a single byte. It is more -- efficient to use /filterByte/ in this case. @@ -1029,6 +1043,7 @@ filterByte w ps = replicate (count w ps) w -- filterNotByte is around 2x faster than its filter equivalent. filterNotByte :: Word8 -> ByteString -> ByteString filterNotByte w (LPS xs) = LPS (filterMap (P.filterNotByte w) xs) +-} -- --------------------------------------------------------------------- -- Searching for substrings @@ -1114,6 +1129,20 @@ tails = tails' . unLPS | otherwise = LPS xs : tails' (P.unsafeTail x : xs') -- --------------------------------------------------------------------- +-- Low level constructors + +-- | /O(n)/ Make a copy of the 'ByteString' with its own storage. +-- This is mainly useful to allow the rest of the data pointed +-- to by the 'ByteString' to be garbage collected, for example +-- if a large string has been read in, and only a small part of it +-- is needed in the rest of the program. +copy :: ByteString -> ByteString +copy (LPS lps) = LPS (L.map P.copy lps) +--TODO, we could coalese small blocks here +--FIXME: probably not strict enough, if we're doing this to avoid retaining +-- the parent blocks then we'd better copy strictly. + +-- --------------------------------------------------------------------- -- TODO defrag func that concatenates block together that are below a threshold -- defrag :: Int -> ByteString -> ByteString @@ -1122,17 +1151,26 @@ tails = tails' . unLPS -- Lazy ByteString IO -- | Read entire handle contents /lazily/ into a 'ByteString'. Chunks --- are read on demand, in @k@-sized chunks. +-- are read on demand, in at most @k@-sized chunks. It does not block +-- waiting for a whole @k@-sized chunk, so if less than @k@ bytes are +-- available then they will be returned immediately as a smaller chunk. hGetContentsN :: Int -> Handle -> IO ByteString hGetContentsN k h = lazyRead >>= return . LPS where - lazyRead = unsafeInterleaveIO $ do - ps <- P.hGet h k - case P.length ps of - 0 -> return [] - n | n < k -> return [ps] - _ -> do pss <- lazyRead - return (ps : pss) + lazyRead = unsafeInterleaveIO loop + + loop = do + ps <- P.hGetNonBlocking h k + --TODO: I think this should distinguish EOF from no data available + -- the otherlying POSIX call makes this distincion, returning either + -- 0 or EAGAIN + if P.null ps + then do eof <- hIsEOF h + if eof then return [] + else hWaitForInput h (-1) + >> loop + else do pss <- lazyRead + return (ps : pss) -- | Read @n@ bytes into a 'ByteString', directly from the -- specified 'Handle', in chunks of size @k@. @@ -1144,26 +1182,27 @@ hGetN k h n = readChunks n >>= return . LPS readChunks i = do ps <- P.hGet h (min k i) case P.length ps of - 0 -> return [] - m | m == i -> return [ps] - m -> do pss <- readChunks (i - m) - return (ps : pss) + 0 -> return [] + m -> do pss <- readChunks (i - m) + return (ps : pss) -#if defined(__GLASGOW_HASKELL__) -- | hGetNonBlockingN is similar to 'hGetContentsN', except that it will never block -- waiting for data to become available, instead it returns only whatever data -- is available. Chunks are read on demand, in @k@-sized chunks. hGetNonBlockingN :: Int -> Handle -> Int -> IO ByteString +#if defined(__GLASGOW_HASKELL__) hGetNonBlockingN _ _ 0 = return empty hGetNonBlockingN k h n = readChunks n >>= return . LPS where + STRICT1(readChunks) readChunks i = do ps <- P.hGetNonBlocking h (min k i) case P.length ps of - 0 -> return [] - m | fromIntegral m < i -> return [ps] - m -> do pss <- readChunks (i - m) - return (ps : pss) + 0 -> return [] + m -> do pss <- readChunks (i - m) + return (ps : pss) +#else +hGetNonBlockingN = hGetN #endif -- | Read entire handle contents /lazily/ into a 'ByteString'. Chunks @@ -1175,15 +1214,16 @@ hGetContents = hGetContentsN defaultChunkSize hGet :: Handle -> Int -> IO ByteString hGet = hGetN defaultChunkSize -#if defined(__GLASGOW_HASKELL__) -- | hGetNonBlocking is similar to 'hGet', except that it will never block -- waiting for data to become available, instead it returns only whatever data -- is available. +#if defined(__GLASGOW_HASKELL__) hGetNonBlocking :: Handle -> Int -> IO ByteString hGetNonBlocking = hGetNonBlockingN defaultChunkSize +#else +hGetNonBlocking = hGet #endif - -- | Read an entire file /lazily/ into a 'ByteString'. readFile :: FilePath -> IO ByteString readFile f = openBinaryFile f ReadMode >>= hGetContents @@ -1244,3 +1284,16 @@ filterMap f (x:xs) = case f x of | otherwise -> y : filterMap f xs {-# INLINE filterMap #-} + +-- | 'findIndexOrEnd' is a variant of findIndex, that returns the length +-- of the string if no element is found, rather than Nothing. +findIndexOrEnd :: (Word8 -> Bool) -> P.ByteString -> Int +findIndexOrEnd k (P.PS x s l) = P.inlinePerformIO $ withForeignPtr x $ \f -> go (f `plusPtr` s) 0 + where + STRICT2(go) + go ptr n | n >= l = return l + | otherwise = do w <- peek ptr + if k w + then return n + else go (ptr `plusPtr` 1) (n+1) +{-# INLINE findIndexOrEnd #-} diff --git a/Data/ByteString/Lazy/Char8.hs b/Data/ByteString/Lazy/Char8.hs index 15af132..ada949b 100644 --- a/Data/ByteString/Lazy/Char8.hs +++ b/Data/ByteString/Lazy/Char8.hs @@ -1,6 +1,4 @@ -{-# OPTIONS_GHC -cpp -optc-O1 -fno-warn-orphans #-} --- --- -optc-O2 breaks with 4.0.4 gcc on debian +{-# OPTIONS_GHC -cpp -fno-warn-orphans #-} -- -- Module : Data.ByteString.Lazy.Char8 -- Copyright : (c) Don Stewart 2006 @@ -102,10 +100,6 @@ module Data.ByteString.Lazy.Char8 ( inits, -- :: ByteString -> [ByteString] tails, -- :: ByteString -> [ByteString] - -- ** Breaking and dropping on specific Chars - breakChar, -- :: Char -> ByteString -> (ByteString, ByteString) - spanChar, -- :: Char -> ByteString -> (ByteString, ByteString) - -- ** Breaking into many substrings split, -- :: Char -> ByteString -> [ByteString] splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString] @@ -119,7 +113,6 @@ module Data.ByteString.Lazy.Char8 ( -- ** Joining strings join, -- :: ByteString -> [ByteString] -> ByteString - joinWithChar, -- :: Char -> ByteString -> ByteString -> ByteString -- * Predicates isPrefixOf, -- :: ByteString -> ByteString -> Bool @@ -130,8 +123,6 @@ module Data.ByteString.Lazy.Char8 ( -- ** Searching by equality elem, -- :: Char -> ByteString -> Bool notElem, -- :: Char -> ByteString -> Bool - filterChar, -- :: Char -> ByteString -> ByteString - filterNotChar, -- :: Char -> ByteString -> ByteString -- ** Searching with a predicate find, -- :: (Char -> Bool) -> ByteString -> Maybe Char @@ -154,6 +145,8 @@ module Data.ByteString.Lazy.Char8 ( -- * Ordered ByteStrings -- sort, -- :: ByteString -> ByteString + copy, -- :: ByteString -> ByteString + -- * Reading from ByteStrings readInt, @@ -172,27 +165,23 @@ module Data.ByteString.Lazy.Char8 ( -- ** I\/O with Handles hGetContents, -- :: Handle -> IO ByteString - hGetContentsN, -- :: Int -> Handle -> IO ByteString hGet, -- :: Handle -> Int64 -> IO ByteString - hGetN, -- :: Int -> Handle -> Int64 -> IO ByteString hPut, -- :: Handle -> ByteString -> IO () -#if defined(__GLASGOW_HASKELL__) hGetNonBlocking, -- :: Handle -> IO ByteString - hGetNonBlockingN, -- :: Int -> Handle -> IO ByteString -#endif + +-- hGetN, -- :: Int -> Handle -> Int64 -> IO ByteString +-- hGetContentsN, -- :: Int -> Handle -> IO ByteString +-- hGetNonBlockingN, -- :: Int -> Handle -> IO ByteString ) where -- Functions transparently exported import Data.ByteString.Lazy (ByteString(..) ,empty,null,length,tail,init,append,reverse,transpose - ,concat,take,drop,splitAt,join,isPrefixOf,group,inits, tails - ,hGetContentsN, hGetN, hGetContents, hGet, hPut, getContents -#if defined(__GLASGOW_HASKELL__) - ,hGetNonBlocking, hGetNonBlockingN -#endif - ,putStr, putStrLn - ,readFile, writeFile, appendFile) + ,concat,take,drop,splitAt,join,isPrefixOf,group,inits,tails,copy + ,hGetContents, hGet, hPut, getContents + ,hGetNonBlocking + ,putStr, putStrLn, interact) -- Functions we need to wrap. import qualified Data.ByteString.Lazy as L @@ -209,7 +198,10 @@ import Prelude hiding ,concat,any,take,drop,splitAt,takeWhile,dropWhile,span,break,elem,filter ,unwords,words,maximum,minimum,all,concatMap,scanl,scanl1,foldl1,foldr1 ,readFile,writeFile,appendFile,replicate,getContents,getLine,putStr,putStrLn - ,zip,zipWith,unzip,notElem,repeat,iterate) + ,zip,zipWith,unzip,notElem,repeat,iterate,interact) + +import System.IO (hClose,openFile,IOMode(..)) +import Control.Exception (bracket) #define STRICT1(f) f a | a `seq` False = undefined #define STRICT2(f) f a b | a `seq` b `seq` False = undefined @@ -226,11 +218,11 @@ singleton = L.singleton . c2w -- | /O(n)/ Convert a 'String' into a 'ByteString'. pack :: [Char] -> ByteString -pack = L.packWith c2w +pack = L.pack. P.map c2w -- | /O(n)/ Converts a 'ByteString' to a 'String'. unpack :: ByteString -> [Char] -unpack = L.unpackWith w2c +unpack = P.map w2c . L.unpack {-# INLINE unpack #-} -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different @@ -406,6 +398,7 @@ span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString) span f = L.span (f . w2c) {-# INLINE span #-} +{- -- | 'breakChar' breaks its ByteString argument at the first occurence -- of the specified Char. It is more efficient than 'break' as it is -- implemented with @memchr(3)@. I.e. @@ -425,6 +418,11 @@ breakChar = L.breakByte . c2w spanChar :: Char -> ByteString -> (ByteString, ByteString) spanChar = L.spanByte . c2w {-# INLINE spanChar #-} +-} + +-- +-- TODO, more rules for breakChar* +-- -- | /O(n)/ Break a 'ByteString' into pieces separated by the byte -- argument, consuming the delimiter. I.e. @@ -470,13 +468,6 @@ tokens f = L.tokens (f . w2c) groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString] groupBy k = L.groupBy (\a b -> k (w2c a) (w2c b)) --- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a --- char. Around 4 times faster than the generalised join. --- -joinWithChar :: Char -> ByteString -> ByteString -> ByteString -joinWithChar = L.joinWithByte . c2w -{-# INLINE joinWithChar #-} - -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0. index :: ByteString -> Int64 -> Char index = (w2c .) . L.index @@ -540,6 +531,7 @@ find :: (Char -> Bool) -> ByteString -> Maybe Char find f ps = w2c `fmap` L.find (f . w2c) ps {-# INLINE find #-} +{- -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common -- case of filtering a single Char. It is more efficient to use -- filterChar in this case. @@ -565,6 +557,7 @@ filterChar c = L.filterByte (c2w c) filterNotChar :: Char -> ByteString -> ByteString filterNotChar c = L.filterNotByte (c2w c) {-# INLINE filterNotChar #-} +-} -- | /O(n)/ 'zip' takes two ByteStrings and returns a list of -- corresponding pairs of Chars. If one input ByteString is short, @@ -691,3 +684,18 @@ readInt (LPS (x:xs)) = | otherwise = ps:pss in n' `seq` ps' `seq` Just $! (n', LPS ps') + +-- | Read an entire file /lazily/ into a 'ByteString'. Use 'text mode' +-- on Windows to interpret newlines +readFile :: FilePath -> IO ByteString +readFile f = openFile f ReadMode >>= hGetContents + +-- | Write a 'ByteString' to a file. +writeFile :: FilePath -> ByteString -> IO () +writeFile f txt = bracket (openFile f WriteMode) hClose + (\hdl -> hPut hdl txt) + +-- | Append a 'ByteString' to a file. +appendFile :: FilePath -> ByteString -> IO () +appendFile f txt = bracket (openFile f AppendMode) hClose + (\hdl -> hPut hdl txt) diff --git a/cbits/fpstring.c b/cbits/fpstring.c index d42ebe5..9e0b809 100644 --- a/cbits/fpstring.c +++ b/cbits/fpstring.c @@ -32,30 +32,30 @@ #include "fpstring.h" /* copy a string in reverse */ -void fps_reverse(unsigned char *dest, unsigned char *from, int len) { - unsigned char *p, *q; - p = from + len - 1; - q = dest; - - while (p >= from) +void fps_reverse(unsigned char *q, unsigned char *p, unsigned long n) { + p += n-1; + while (n-- != 0) *q++ = *p--; } /* duplicate a string, interspersing the character through the elements of the duplicated string */ -void fps_intersperse(unsigned char *dest, unsigned char *from, int len, char c) { - unsigned char *p, *q; - p = from; - q = dest; - while (p < from + len - 1) { - *q++ = *p++; +void fps_intersperse(unsigned char *q, + unsigned char *p, + unsigned long n, + unsigned char c) { + + while (n > 1) { + *q++ = *p++; *q++ = c; + n--; } - *q = *p; + if (n == 1) + *q = *p; } /* find maximum char in a packed string */ -unsigned char fps_maximum(unsigned char *p, int len) { +unsigned char fps_maximum(unsigned char *p, unsigned long len) { unsigned char *q, c = *p; for (q = p; q < p + len; q++) if (*q > c) @@ -64,7 +64,7 @@ unsigned char fps_maximum(unsigned char *p, int len) { } /* find minimum char in a packed string */ -unsigned char fps_minimum(unsigned char *p, int len) { +unsigned char fps_minimum(unsigned char *p, unsigned long len) { unsigned char *q, c = *p; for (q = p; q < p + len; q++) if (*q < c) @@ -73,9 +73,9 @@ unsigned char fps_minimum(unsigned char *p, int len) { } /* count the number of occurences of a char in a string */ -int fps_count(unsigned char *p, int len, unsigned char w) { - int c; - for (c = 0; len--; ++p) +unsigned long fps_count(unsigned char *p, unsigned long len, unsigned char w) { + unsigned long c; + for (c = 0; len-- != 0; ++p) if (*p == w) ++c; return c; diff --git a/include/fpstring.h b/include/fpstring.h index 42e8346..afbc911 100644 --- a/include/fpstring.h +++ b/include/fpstring.h @@ -1,6 +1,6 @@ -void fps_reverse(unsigned char *dest, unsigned char *from, int len); -void fps_intersperse(unsigned char *dest, unsigned char *from, int len, char c); -unsigned char fps_maximum(unsigned char *p, int len); -unsigned char fps_minimum(unsigned char *p, int len); -int fps_count(unsigned char *p, int len, unsigned char w); +void fps_reverse(unsigned char *dest, unsigned char *from, unsigned long len); +void fps_intersperse(unsigned char *dest, unsigned char *from, unsigned long len, unsigned char c); +unsigned char fps_maximum(unsigned char *p, unsigned long len); +unsigned char fps_minimum(unsigned char *p, unsigned long len); +unsigned long fps_count(unsigned char *p, unsigned long len, unsigned char w);