-{-# OPTIONS_GHC -cpp -fffi #-}
+{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
--
-- Module : ByteString
-- Copyright : (c) The University of Glasgow 2001,
-- ** Breaking and dropping on specific bytes
breakByte, -- :: Word8 -> ByteString -> (ByteString, ByteString)
+ spanByte, -- :: Word8 -> ByteString -> (ByteString, ByteString)
breakFirst, -- :: Word8 -> ByteString -> Maybe (ByteString,ByteString)
breakLast, -- :: Word8 -> ByteString -> Maybe (ByteString,ByteString)
split, -- :: Word8 -> ByteString -> [ByteString]
splitWith, -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
tokens, -- :: (Word8 -> Bool) -> ByteString -> [ByteString]
+ group, -- :: ByteString -> [ByteString]
+ groupBy, -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
-- ** Joining strings
join, -- :: ByteString -> [ByteString] -> ByteString
-- Control.Exception.bracket not available in yhc or nhc
import Control.Exception (bracket)
+import Control.Monad (when)
import Foreign.C.String (CString, CStringLen)
import Foreign.C.Types (CSize, CInt)
foldr1 :: (Word8 -> Word8 -> Word8) -> ByteString -> Word8
foldr1 f ps
| null ps = errorEmptyList "foldr1"
- | otherwise = f (unsafeHead ps) (foldr1 f (unsafeTail ps))
+ -- Non-empty input: the last byte seeds the right fold, which then runs
+ -- over everything before it ('init').
+ | otherwise = foldr f (last ps) (init ps)
-- ---------------------------------------------------------------------
-- Special folds
Just n -> (take n p, drop n p)
{-# INLINE breakByte #-}
+-- | 'spanByte' splits its ByteString argument at the first occurrence
+-- of a byte that differs from the given byte. The first component is
+-- the leading run of bytes equal to the argument, the second component
+-- is the rest. It is more efficient than 'span (==)'
+--
+-- > span (=='c') "abcd" == spanByte 'c' "abcd"
+--
+spanByte :: Word8 -> ByteString -> (ByteString, ByteString)
+spanByte c ps@(PS fp off len) =
+    inlinePerformIO $ withForeignPtr fp $ \base -> scan (base `plusPtr` off) 0
+  where
+    -- scan ptr n: ptr addresses the first byte of the string, n is the
+    -- index of the byte to inspect next.
+    STRICT2(scan)
+    scan ptr n
+        -- Ran off the end: every byte matched, so the whole string is
+        -- the prefix.
+        | n >= len  = return (ps, empty)
+        | otherwise = do
+            w <- peekByteOff ptr n
+            if c == w
+                then scan ptr (n + 1)
+                else return (take n ps, drop n ps)
+{-# INLINE spanByte #-}
+
-- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
-- occurence of @w@. It behaves like 'break', except the delimiter is
-- not returned, and @Nothing@ is returned if the delimiter is not in
-- | 'span' @p xs@ breaks the ByteString into two segments. It is
-- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
span :: (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
-span p ps = break (not . p) ps
+span p ps = break (not . p) ps
{-# INLINE span #-}
-- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
tokens :: (Word8 -> Bool) -> ByteString -> [ByteString]
tokens f = P.filter (not.null) . splitWith f
+-- | 'group' splits a ByteString into a list of ByteStrings whose
+-- concatenation is equal to the argument. Each ByteString in the
+-- result is a run of one repeated byte. For example,
+--
+-- > group "Mississippi" = ["M","i","ss","i","ss","i","pp","i"]
+--
+-- It is a special case of 'groupBy', which allows the programmer to
+-- supply their own equality test. It is about 40% faster than
+-- /groupBy (==)/
+group :: ByteString -> [ByteString]
+group bs =
+    if null bs
+        then []
+        else run : group rest
+  where
+    -- Only demanded in the non-empty branch, so 'unsafeHead' is never
+    -- applied to an empty ByteString.
+    (run, rest) = spanByte (unsafeHead bs) bs
+
+-- | 'groupBy' is the non-overloaded version of 'group': the caller
+-- supplies the equality test. Every byte of a group is tested against
+-- the first byte of that group, not against its immediate predecessor.
+groupBy :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
+groupBy k bs =
+    if null bs
+        then []
+        else take runLen bs : groupBy k (drop runLen bs)
+  where
+    firstByte = unsafeHead bs
+    -- The head always belongs to its own group (hence the 1 +); the
+    -- search over the tail stops at the first byte rejected by k.
+    -- NOTE(review): assumes findIndexOrEnd yields the tail's length when
+    -- no byte is rejected -- confirm against its definition.
+    runLen = 1 + findIndexOrEnd (\w -> not (k firstByte w)) (unsafeTail bs)
+
-- | /O(n)/ The 'join' function takes a 'ByteString' and a list of
-- 'ByteString's and concatenates the list after interspersing the first
-- argument between each element of the list.
-- ---------------------------------------------------------------------
-- ** Ordered 'ByteString's
--- | /O(n log(n))/ Sort a ByteString efficiently, using qsort(3).
+-- | /O(n)/ Sort a ByteString efficiently, using counting sort.
+--
+-- The input is scanned once to count how often each of the 256 possible
+-- byte values occurs; the result is then written out as one run per
+-- byte value, in ascending order of value.
+sort :: ByteString -> ByteString
+sort (PS input s l) = create l $ \p -> allocaArray 256 $ \arr -> do
+
+ -- Zero the temporary table of 256 CSize counters, then let countEach
+ -- fill it from the l input bytes that start at offset s.
+ memset (castPtr arr) 0 (256 * fromIntegral (sizeOf (undefined :: CSize)))
+ withForeignPtr input (\x -> countEach arr (x `plusPtr` s) l)
+
+ -- go i ptr: i is the byte value being emitted, ptr the next free
+ -- position in the output buffer p. The loop ends once i reaches 256.
+ let STRICT2(go)
+ go 256 _ = return ()
+ go i ptr = do n <- peekElemOff arr i
+ -- Write n copies of byte value i; skipped when the count is zero.
+ -- NOTE(review): assumes memset takes (destination, value, count)
+ -- like C memset -- confirm against its binding.
+ when (n /= 0) $ memset ptr (fromIntegral i) n >> return ()
+ go (i + 1) (ptr `plusPtr` (fromIntegral n))
+ go 0 p
+
+-- "countEach counts str l" counts the number of occurrences of each
+-- Word8 in the first l bytes of str. The tally for byte value k is kept
+-- in element k of counts; each occurrence adds one to whatever that
+-- element already holds, so the caller is expected to clear counts
+-- beforehand.
+countEach :: Ptr CSize -> Ptr Word8 -> Int -> IO ()
+STRICT3(countEach)
+countEach counts str l = loop 0
+  where
+    -- loop ix: ix is the index of the next input byte to tally.
+    STRICT1(loop)
+    loop ix
+        | ix == l   = return ()
+        | otherwise = do
+            slot <- fmap fromIntegral (peekElemOff str ix)
+            old  <- peekElemOff counts slot
+            pokeElemOff counts slot (old + 1)
+            loop (ix + 1)
+
+{-
sort :: ByteString -> ByteString
sort (PS x s l) = create l $ \p -> withForeignPtr x $ \f -> do
memcpy p (f `plusPtr` s) l
c_qsort p l -- inplace
+-}
{-
sort = pack . List.sort . unpack
-}
+-- TODO: 'sortBy', the non-overloaded version of 'sort', is not
+-- implemented yet.
+--
+-- Try some linear sorts: radix, counting
+-- Or mergesort.
+--
+-- sortBy :: (Word8 -> Word8 -> Ordering) -> ByteString -> ByteString
+-- sortBy f ps = undefined
+
-- ---------------------------------------------------------------------
--
-- Extensions to the basic interface
-- | A way of creating ForeignPtrs outside the IO monad. The @Int@
-- argument gives the final size of the ByteString. Unlike 'generate'
--- the ByteString is no reallocated if the final size is less than the
--- estimated size.
+-- the ByteString is not reallocated if the final size is less than the
+-- estimated size. Also, unlike 'generate', ByteStrings created this way
+-- are managed on the Haskell heap.
create :: Int -> (Ptr Word8 -> IO ()) -> ByteString
create l write_ptr = inlinePerformIO $ do
fp <- mallocByteString (l+1)
foreign import ccall unsafe "static fpstring.h count" c_count
:: Ptr Word8 -> Int -> Word8 -> Int
-foreign import ccall unsafe "static fpstring.h my_qsort" c_qsort
- :: Ptr Word8 -> Int -> IO ()
-
-- ---------------------------------------------------------------------
-- MMap
-{-# OPTIONS_GHC -cpp -fffi #-}
+{-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
--
-- Module : Data.ByteString.Char8
-- Copyright : (c) Don Stewart 2006
-- ** Breaking and dropping on specific Chars
breakChar, -- :: Char -> ByteString -> (ByteString, ByteString)
+ spanChar, -- :: Char -> ByteString -> (ByteString, ByteString)
breakFirst, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
breakLast, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
breakSpace, -- :: ByteString -> Maybe (ByteString,ByteString)
split, -- :: Char -> ByteString -> [ByteString]
splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString]
tokens, -- :: (Char -> Bool) -> ByteString -> [ByteString]
+ group, -- :: ByteString -> [ByteString]
+ groupBy, -- :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
-- ** Breaking into lines and words
lines, -- :: ByteString -> [ByteString]
,inits,tails,elems,reverse,transpose
,concat,take,drop,splitAt,join
,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
- ,findSubstrings,unsafeTail,copy
+ ,findSubstrings,unsafeTail,copy,group
,getContents, putStr, putStrLn
,readFile, {-mmapFile,-} writeFile
-- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
-- and thus must be applied to non-empty 'ByteString's
foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
-foldr1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
+-- Wraps f so that it works on Word8 (w2c on the way in, c2w on the way
+-- out) and delegates to the Word8 right fold 'B.foldr1'.
+foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
{-# INLINE foldr1 #-}
-- | Map a function over a 'ByteString' and concatenate the results
breakChar = B.breakByte . c2w
{-# INLINE breakChar #-}
+-- | 'spanChar' breaks its ByteString argument at the first
+-- occurrence of a Char other than its argument. It is more efficient
+-- than 'span (==)'
+--
+-- > span (=='c') "abcd" == spanChar 'c' "abcd"
+--
+spanChar :: Char -> ByteString -> (ByteString, ByteString)
+spanChar = B.spanByte . c2w
+{-# INLINE spanChar #-}
+
-- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
-- occurence of @w@. It behaves like 'break', except the delimiter is
-- not returned, and @Nothing@ is returned if the delimiter is not in
tokens f = B.tokens (f . w2c)
{-# INLINE tokens #-}
+-- | 'groupBy' is the non-overloaded version of 'group': the caller
+-- supplies the equality test on 'Char's. The test is lifted to bytes
+-- with 'w2c' and the work is done by the Word8 'B.groupBy'.
+groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
+groupBy k = B.groupBy byteEq
+  where
+    -- k, viewed as a predicate on the underlying bytes.
+    byteEq x y = k (w2c x) (w2c y)
+
-- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
-- char. Around 4 times faster than the generalised join.
--
-- | count returns the number of times its argument appears in the ByteString
--
-- > count = length . elemIndices
+--
+-- Also
+--
+-- > count '\n' == length . lines
--
-- But more efficiently than using length on the intermediate list.
count :: Char -> ByteString -> Int
-- | 'lines' breaks a ByteString up into a list of ByteStrings at
-- newline Chars. The resulting strings do not contain newlines.
+--
lines :: ByteString -> [ByteString]
lines ps
| null ps = []