1 {-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
3 -- Module : Data.ByteString.Char8
4 -- Copyright : (c) Don Stewart 2006
7 -- Maintainer : dons@cse.unsw.edu.au
8 -- Stability : experimental
9 -- Portability : portable (tested with GHC>=6.4.1 and Hugs 2005)
13 -- | Manipulate ByteStrings using Char operations. All Chars will be
14 -- truncated to 8 bits. It can be expected that these functions will run
15 -- at identical speeds to their Word8 equivalents in @Data.ByteString@.
17 -- More specifically these byte strings are taken to be in the
18 -- subset of Unicode covered by code points 0-255. This covers
19 -- Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls.
23 -- * <http://www.unicode.org/charts/>
25 -- * <http://www.unicode.org/charts/PDF/U0000.pdf>
27 -- * <http://www.unicode.org/charts/PDF/U0080.pdf>
29 -- This module is intended to be imported @qualified@, to avoid name
30 -- clashes with Prelude functions. eg.
32 -- > import qualified Data.ByteString.Char8 as B
35 module Data.ByteString.Char8 (
37 -- * The @ByteString@ type
38 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
40 -- * Introducing and eliminating 'ByteString's
41 empty, -- :: ByteString
42 packChar, -- :: Char -> ByteString
43 pack, -- :: String -> ByteString
44 unpack, -- :: ByteString -> String
47 cons, -- :: Char -> ByteString -> ByteString
48 snoc, -- :: Char -> ByteString -> ByteString
49 null, -- :: ByteString -> Bool
50 length, -- :: ByteString -> Int
51 head, -- :: ByteString -> Char
52 tail, -- :: ByteString -> ByteString
53 last, -- :: ByteString -> Char
54 init, -- :: ByteString -> ByteString
55 append, -- :: ByteString -> ByteString -> ByteString
57 -- * Special ByteStrings
58 inits, -- :: ByteString -> [ByteString]
59 tails, -- :: ByteString -> [ByteString]
60 elems, -- :: ByteString -> [ByteString]
62 -- * Transformating ByteStrings
63 map, -- :: (Char -> Char) -> ByteString -> ByteString
64 reverse, -- :: ByteString -> ByteString
65 intersperse, -- :: Char -> ByteString -> ByteString
66 transpose, -- :: [ByteString] -> [ByteString]
68 -- * Reducing 'ByteString's
69 foldl, -- :: (a -> Char -> a) -> a -> ByteString -> a
70 foldr, -- :: (Char -> a -> a) -> a -> ByteString -> a
71 foldl1, -- :: (Char -> Char -> Char) -> ByteString -> Char
72 foldl1', -- :: (Char -> Char -> Char) -> ByteString -> Char
73 foldr1, -- :: (Char -> Char -> Char) -> ByteString -> Char
74 foldl', -- :: (a -> Char -> a) -> a -> ByteString -> a
77 concat, -- :: [ByteString] -> ByteString
78 concatMap, -- :: (Char -> ByteString) -> ByteString -> ByteString
79 any, -- :: (Char -> Bool) -> ByteString -> Bool
80 all, -- :: (Char -> Bool) -> ByteString -> Bool
81 maximum, -- :: ByteString -> Char
82 minimum, -- :: ByteString -> Char
83 mapIndexed, -- :: (Int -> Char -> Char) -> ByteString -> ByteString
85 -- * Building ByteStrings
89 -- * Generating and unfolding ByteStrings
90 replicate, -- :: Int -> Char -> ByteString
91 unfoldrN, -- :: (a -> Maybe (Char, a)) -> a -> ByteString
95 -- ** Breaking strings
96 take, -- :: Int -> ByteString -> ByteString
97 drop, -- :: Int -> ByteString -> ByteString
98 splitAt, -- :: Int -> ByteString -> (ByteString, ByteString)
99 takeWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
100 dropWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
101 break, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
102 span, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
103 spanEnd, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
105 -- ** Breaking and dropping on specific Chars
106 breakChar, -- :: Char -> ByteString -> (ByteString, ByteString)
107 spanChar, -- :: Char -> ByteString -> (ByteString, ByteString)
108 breakFirst, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
109 breakLast, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
110 breakSpace, -- :: ByteString -> Maybe (ByteString,ByteString)
111 dropSpace, -- :: ByteString -> ByteString
112 dropSpaceEnd, -- :: ByteString -> ByteString
114 -- ** Breaking into many substrings
115 split, -- :: Char -> ByteString -> [ByteString]
116 splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString]
117 tokens, -- :: (Char -> Bool) -> ByteString -> [ByteString]
118 group, -- :: ByteString -> [ByteString]
119 groupBy, -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
121 -- ** Breaking into lines and words
122 lines, -- :: ByteString -> [ByteString]
123 words, -- :: ByteString -> [ByteString]
124 unlines, -- :: [ByteString] -> ByteString
125 unwords, -- :: ByteString -> [ByteString]
127 lines', -- :: ByteString -> [ByteString]
128 unlines', -- :: [ByteString] -> ByteString
129 linesCRLF', -- :: ByteString -> [ByteString]
130 unlinesCRLF', -- :: [ByteString] -> ByteString
131 words', -- :: ByteString -> [ByteString]
132 unwords', -- :: ByteString -> [ByteString]
134 lineIndices, -- :: ByteString -> [Int]
135 betweenLines, -- :: ByteString -> ByteString -> ByteString -> Maybe (ByteString)
137 -- ** Joining strings
138 join, -- :: ByteString -> [ByteString] -> ByteString
139 joinWithChar, -- :: Char -> ByteString -> ByteString -> ByteString
141 -- * Indexing ByteStrings
142 index, -- :: ByteString -> Int -> Char
143 elemIndex, -- :: Char -> ByteString -> Maybe Int
144 elemIndexLast, -- :: Char -> ByteString -> Maybe Int
145 elemIndices, -- :: Char -> ByteString -> [Int]
146 findIndex, -- :: (Char -> Bool) -> ByteString -> Maybe Int
147 findIndices, -- :: (Char -> Bool) -> ByteString -> [Int]
148 count, -- :: Char -> ByteString -> Int
150 -- * Ordered ByteStrings
151 sort, -- :: ByteString -> ByteString
153 -- * Searching ByteStrings
155 -- ** Searching by equality
156 elem, -- :: Char -> ByteString -> Bool
157 notElem, -- :: Char -> ByteString -> Bool
158 filterChar, -- :: Char -> ByteString -> ByteString
159 filterNotChar, -- :: Char -> ByteString -> ByteString
161 -- ** Searching with a predicate
162 filter, -- :: (Char -> Bool) -> ByteString -> ByteString
163 find, -- :: (Char -> Bool) -> ByteString -> Maybe Char
165 -- ** Searching for substrings
166 isPrefixOf, -- :: ByteString -> ByteString -> Bool
167 isSuffixOf, -- :: ByteString -> ByteString -> Bool
168 isSubstringOf, -- :: ByteString -> ByteString -> Bool
169 findSubstring, -- :: ByteString -> ByteString -> Maybe Int
170 findSubstrings, -- :: ByteString -> ByteString -> [Int]
172 -- * Zipping and unzipping ByteString
173 zip, -- :: ByteString -> ByteString -> [(Char,Char)]
174 zipWith, -- :: (Char -> Char -> c) -> ByteString -> ByteString -> [c]
175 unzip, -- :: [(Char,Char)] -> (ByteString,ByteString)
177 -- * Unchecked access
178 unsafeHead, -- :: ByteString -> Char
179 unsafeTail, -- :: ByteString -> ByteString
180 unsafeIndex, -- :: ByteString -> Int -> Char
181 w2c, -- :: Word8 -> Char
182 c2w, -- :: Char -> Word8
184 -- * Reading from ByteStrings
185 readInt, -- :: ByteString -> Maybe Int
186 unsafeReadInt, -- :: ByteString -> Maybe Int
188 -- * Copying ByteStrings
189 copy, -- :: ByteString -> ByteString
191 -- * I\/O with @ByteString@s
193 -- ** Standard input and output
195 #if defined(__GLASGOW_HASKELL__)
196 getLine, -- :: IO ByteString
198 getContents, -- :: IO ByteString
199 putStr, -- :: ByteString -> IO ()
200 putStrLn, -- :: ByteString -> IO ()
203 readFile, -- :: FilePath -> IO ByteString
204 -- mmapFile, -- :: FilePath -> IO ByteString
205 writeFile, -- :: FilePath -> ByteString -> IO ()
207 -- ** I\/O with Handles
208 #if defined(__GLASGOW_HASKELL__)
209 getArgs, -- :: IO [ByteString]
210 hGetLine, -- :: Handle -> IO ByteString
211 hGetNonBlocking, -- :: Handle -> Int -> IO ByteString
213 hGetContents, -- :: Handle -> IO ByteString
214 hGet, -- :: Handle -> Int -> IO ByteString
215 hPut, -- :: Handle -> ByteString -> IO ()
217 #if defined(__GLASGOW_HASKELL__)
218 -- * Low level construction
219 -- | For constructors from foreign language types see /Data.ByteString/
220 packAddress, -- :: Addr# -> ByteString
221 unsafePackAddress, -- :: Int -> Addr# -> ByteString
224 -- * Utilities (needed for array fusion)
225 #if defined(__GLASGOW_HASKELL__)
228 noAL, NoAL, loopArr, loopAcc, loopSndAcc,
229 loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL,
234 import qualified Prelude as P
235 import Prelude hiding (reverse,head,tail,last,init,null
236 ,length,map,lines,foldl,foldr,unlines
237 ,concat,any,take,drop,splitAt,takeWhile
238 ,dropWhile,span,break,elem,filter,unwords
239 ,words,maximum,minimum,all,concatMap,scanl,scanl1
240 ,foldl1,foldr1,readFile,writeFile,replicate
241 ,getContents,getLine,putStr,putStrLn
242 ,zip,zipWith,unzip,notElem)
244 import qualified Data.ByteString as B
246 -- Listy functions transparently exported
247 import Data.ByteString (ByteString(..)
248 ,empty,null,length,tail,init,append
249 ,inits,tails,elems,reverse,transpose
250 ,concat,take,drop,splitAt,join
251 ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
252 ,findSubstrings,unsafeTail,copy,group
254 ,getContents, putStr, putStrLn
255 ,readFile, {-mmapFile,-} writeFile
256 ,hGetContents, hGet, hPut
257 #if defined(__GLASGOW_HASKELL__)
258 ,getLine, getArgs, hGetLine, hGetNonBlocking
259 ,packAddress, unsafePackAddress
262 ,noAL, NoAL, loopArr, loopAcc, loopSndAcc
263 ,loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL, scanEFL
264 ,useAsCString, unsafeUseAsCString
269 import qualified Data.List as List (intersperse)
272 import Foreign.C.Types (CLong)
273 import Foreign.Marshal.Utils (with)
275 #if defined(__GLASGOW_HASKELL__)
276 import GHC.Base (Char(..),unsafeChr,unpackCString#,unsafeCoerce#)
277 import GHC.IOBase (IO(..),stToIO)
278 import GHC.Prim (Addr#,writeWord8OffAddr#,realWorld#,plusAddr#)
279 import GHC.Ptr (Ptr(..))
280 import GHC.ST (ST(..))
283 #define STRICT1(f) f a | a `seq` False = undefined
284 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
285 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
287 ------------------------------------------------------------------------
289 -- | /O(1)/ Convert a 'Char' into a 'ByteString'
290 packChar :: Char -> ByteString
291 packChar = B.packByte . c2w
292 {-# INLINE packChar #-}
294 -- | /O(n)/ Convert a 'String' into a 'ByteString'
296 -- For applications with large numbers of string literals, pack can be a
297 -- bottleneck. In such cases, consider using packAddress (GHC only).
298 pack :: String -> ByteString
299 #if !defined(__GLASGOW_HASKELL__)
301 pack str = B.create (P.length str) $ \p -> go p str
302 where go _ [] = return ()
303 go p (x:xs) = poke p (c2w x) >> go (p `plusPtr` 1) xs
305 #else /* hack away */
307 pack str = B.create (P.length str) $ \(Ptr p) -> stToIO (go p str)
309 go :: Addr# -> [Char] -> ST a ()
311 go p (C# c:cs) = writeByte p (unsafeCoerce# c) >> go (p `plusAddr#` 1#) cs
313 writeByte p c = ST $ \s# ->
314 case writeWord8OffAddr# p 0# c s# of s2# -> (# s2#, () #)
315 {-# INLINE writeByte #-}
318 "pack/packAddress" forall s# .
319 pack (unpackCString# s#) = B.packAddress s#
326 -- | /O(n)/ Converts a 'ByteString' to a 'String'.
327 unpack :: ByteString -> [Char]
328 unpack = B.unpackWith w2c
329 {-# INLINE unpack #-}
331 -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different
332 -- complexity, as it requires a memcpy.
333 cons :: Char -> ByteString -> ByteString
337 -- | /O(n)/ Append a Char to the end of a 'ByteString'. Similar to
338 -- 'cons', this function performs a memcpy.
339 snoc :: ByteString -> Char -> ByteString
340 snoc p = B.snoc p . c2w
343 -- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
344 head :: ByteString -> Char
348 -- | /O(1)/ Extract the last element of a packed string, which must be non-empty.
349 last :: ByteString -> Char
353 -- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each element of @xs@
354 map :: (Char -> Char) -> ByteString -> ByteString
355 map f = B.map (c2w . f . w2c)
358 -- | /O(n)/ The 'intersperse' function takes a Char and a 'ByteString'
359 -- and \`intersperses\' that Char between the elements of the
360 -- 'ByteString'. It is analogous to the intersperse function on Lists.
361 intersperse :: Char -> ByteString -> ByteString
362 intersperse = B.intersperse . c2w
363 {-# INLINE intersperse #-}
365 -- | 'foldl', applied to a binary operator, a starting value (typically
366 -- the left-identity of the operator), and a ByteString, reduces the
367 -- ByteString using the binary operator, from left to right.
368 foldl :: (a -> Char -> a) -> a -> ByteString -> a
369 foldl f = B.foldl (\a c -> f a (w2c c))
372 -- | 'foldl\'' is like foldl, but strict in the accumulator.
373 foldl' :: (a -> Char -> a) -> a -> ByteString -> a
374 foldl' f = B.foldl' (\a c -> f a (w2c c))
375 {-# INLINE foldl' #-}
377 -- | 'foldr', applied to a binary operator, a starting value
378 -- (typically the right-identity of the operator), and a packed string,
379 -- reduces the packed string using the binary operator, from right to left.
380 foldr :: (Char -> a -> a) -> a -> ByteString -> a
381 foldr f = B.foldr (\c a -> f (w2c c) a)
384 -- | 'foldl1' is a variant of 'foldl' that has no starting value
385 -- argument, and thus must be applied to non-empty 'ByteStrings'.
386 foldl1 :: (Char -> Char -> Char) -> ByteString -> Char
387 foldl1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
388 {-# INLINE foldl1 #-}
390 -- | A strict version of 'foldl1'
391 foldl1' :: (Char -> Char -> Char) -> ByteString -> Char
392 foldl1' f ps = w2c (B.foldl1' (\x y -> c2w (f (w2c x) (w2c y))) ps)
393 {-# INLINE foldl1' #-}
395 -- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
396 -- and thus must be applied to non-empty 'ByteString's
397 foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
398 foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
399 {-# INLINE foldr1 #-}
401 -- | Map a function over a 'ByteString' and concatenate the results
402 concatMap :: (Char -> ByteString) -> ByteString -> ByteString
403 concatMap f = B.concatMap (f . w2c)
404 {-# INLINE concatMap #-}
406 -- | Applied to a predicate and a ByteString, 'any' determines if
407 -- any element of the 'ByteString' satisfies the predicate.
408 any :: (Char -> Bool) -> ByteString -> Bool
409 any f = B.any (f . w2c)
412 -- | Applied to a predicate and a 'ByteString', 'all' determines if
413 -- all elements of the 'ByteString' satisfy the predicate.
414 all :: (Char -> Bool) -> ByteString -> Bool
415 all f = B.all (f . w2c)
418 -- | 'maximum' returns the maximum value from a 'ByteString'
419 maximum :: ByteString -> Char
420 maximum = w2c . B.maximum
421 {-# INLINE maximum #-}
423 -- | 'minimum' returns the minimum value from a 'ByteString'
424 minimum :: ByteString -> Char
425 minimum = w2c . B.minimum
426 {-# INLINE minimum #-}
428 -- | /O(n)/ map Char functions, provided with the index at each position
429 mapIndexed :: (Int -> Char -> Char) -> ByteString -> ByteString
430 mapIndexed f = B.mapIndexed (\i c -> c2w (f i (w2c c)))
431 {-# INLINE mapIndexed #-}
433 -- | 'scanl' is similar to 'foldl', but returns a list of successive
434 -- reduced values from the left:
436 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
440 -- > last (scanl f z xs) == foldl f z xs.
441 scanl :: (Char -> Char -> Char) -> Char -> ByteString -> ByteString
442 scanl f z = B.scanl (\a b -> c2w (f (w2c a) (w2c b))) (c2w z)
444 -- | 'scanl1' is a variant of 'scanl' that has no starting value argument:
446 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
447 scanl1 :: (Char -> Char -> Char) -> ByteString -> ByteString
448 scanl1 f = B.scanl1 (\a b -> c2w (f (w2c a) (w2c b)))
450 -- | /O(n)/ 'replicate' @n x@ is a ByteString of length @n@ with @x@
451 -- the value of every element. The following holds:
453 -- > replicate w c = unfoldr w (\u -> Just (u,u)) c
455 -- This implemenation uses @memset(3)@
456 replicate :: Int -> Char -> ByteString
457 replicate w = B.replicate w . c2w
458 {-# INLINE replicate #-}
460 -- | /O(n)/ The 'unfoldrN' function is analogous to the List \'unfoldr\'.
461 -- 'unfoldrN' builds a ByteString from a seed value. The function takes
462 -- the element and returns 'Nothing' if it is done producing the
463 -- ByteString or returns 'Just' @(a,b)@, in which case, @a@ is a
464 -- prepending to the ByteString and @b@ is used as the next element in a
467 -- To preven unfoldrN having /O(n^2)/ complexity (as prepending a
468 -- character to a ByteString is /O(n)/, this unfoldr requires a maximum
469 -- final size of the ByteString as an argument. 'cons' can then be
470 -- implemented in /O(1)/ (i.e. a 'poke'), and the unfoldr itself has
471 -- linear complexity. The depth of the recursion is limited to this
472 -- size, but may be less. For lazy, infinite unfoldr, use
473 -- 'Data.List.unfoldr' (from 'Data.List').
477 -- > unfoldrN 10 (\x -> Just (x, chr (ord x + 1))) '0' == "0123456789"
479 -- The following equation connects the depth-limited unfoldr to the List unfoldr:
481 -- > unfoldrN n == take n $ List.unfoldr
483 unfoldrN :: Int -> (a -> Maybe (Char, a)) -> a -> ByteString
484 unfoldrN n f w = B.unfoldrN n ((k `fmap`) . f) w
485 where k (i,j) = (c2w i, j)
486 {-# INLINE unfoldrN #-}
488 -- | 'takeWhile', applied to a predicate @p@ and a ByteString @xs@,
489 -- returns the longest prefix (possibly empty) of @xs@ of elements that
491 takeWhile :: (Char -> Bool) -> ByteString -> ByteString
492 takeWhile f = B.takeWhile (f . w2c)
493 {-# INLINE takeWhile #-}
495 -- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
496 dropWhile :: (Char -> Bool) -> ByteString -> ByteString
497 dropWhile f = B.dropWhile (f . w2c)
498 {-# INLINE dropWhile #-}
500 -- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
501 break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
502 break f = B.break (f . w2c)
505 -- | 'span' @p xs@ breaks the ByteString into two segments. It is
506 -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
507 span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
508 span f = B.span (f . w2c)
511 -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
514 -- > spanEnd (not.isSpace) "x y z" == ("x y ","z")
518 -- > spanEnd (not . isSpace) ps
520 -- > let (x,y) = span (not.isSpace) (reverse ps) in (reverse y, reverse x)
522 spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
523 spanEnd f = B.spanEnd (f . w2c)
524 {-# INLINE spanEnd #-}
526 -- | 'breakChar' breaks its ByteString argument at the first occurence
527 -- of the specified Char. It is more efficient than 'break' as it is
528 -- implemented with @memchr(3)@. I.e.
530 -- > break (=='c') "abcd" == breakChar 'c' "abcd"
532 breakChar :: Char -> ByteString -> (ByteString, ByteString)
533 breakChar = B.breakByte . c2w
534 {-# INLINE breakChar #-}
536 -- | 'spanChar' breaks its ByteString argument at the first
537 -- occurence of a Char other than its argument. It is more efficient
540 -- > span (=='c') "abcd" == spanByte 'c' "abcd"
542 spanChar :: Char -> ByteString -> (ByteString, ByteString)
543 spanChar = B.spanByte . c2w
544 {-# INLINE spanChar #-}
546 -- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
547 -- occurence of @w@. It behaves like 'break', except the delimiter is
548 -- not returned, and @Nothing@ is returned if the delimiter is not in
549 -- the ByteString. I.e.
551 -- > breakFirst 'b' "aabbcc" == Just ("aa","bcc")
553 -- > breakFirst c xs ==
554 -- > let (x,y) = break (== c) xs
555 -- > in if null y then Nothing else Just (x, drop 1 y))
557 breakFirst :: Char -> ByteString -> Maybe (ByteString,ByteString)
558 breakFirst = B.breakFirst . c2w
559 {-# INLINE breakFirst #-}
561 -- | /O(n)/ 'breakLast' behaves like breakFirst, but from the end of the
564 -- > breakLast ('b') (pack "aabbcc") == Just ("aab","cc")
566 -- and the following are equivalent:
568 -- > breakLast 'c' "abcdef"
569 -- > let (x,y) = break (=='c') (reverse "abcdef")
570 -- > in if null x then Nothing else Just (reverse (drop 1 y), reverse x)
572 breakLast :: Char -> ByteString -> Maybe (ByteString,ByteString)
573 breakLast = B.breakLast . c2w
574 {-# INLINE breakLast #-}
576 -- | /O(n)/ Break a 'ByteString' into pieces separated by the byte
577 -- argument, consuming the delimiter. I.e.
579 -- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
580 -- > split 'a' "aXaXaXa" == ["","X","X","X"]
581 -- > split 'x' "x" == ["",""]
585 -- > join [c] . split c == id
586 -- > split == splitWith . (==)
588 -- As for all splitting functions in this library, this function does
589 -- not copy the substrings, it just constructs new 'ByteStrings' that
590 -- are slices of the original.
592 split :: Char -> ByteString -> [ByteString]
593 split = B.split . c2w
596 -- | /O(n)/ Splits a 'ByteString' into components delimited by
597 -- separators, where the predicate returns True for a separator element.
598 -- The resulting components do not contain the separators. Two adjacent
599 -- separators result in an empty component in the output. eg.
601 -- > splitWith (=='a') "aabbaca" == ["","","bb","c",""]
603 splitWith :: (Char -> Bool) -> ByteString -> [ByteString]
604 splitWith f = B.splitWith (f . w2c)
605 {-# INLINE splitWith #-}
606 -- the inline makes a big difference here.
608 -- | Like 'splitWith', except that sequences of adjacent separators are
609 -- treated as a single separator. eg.
611 -- > tokens (=='a') "aabbaca" == ["bb","c"]
613 tokens :: (Char -> Bool) -> ByteString -> [ByteString]
614 tokens f = B.tokens (f . w2c)
615 {-# INLINE tokens #-}
617 -- | The 'groupBy' function is the non-overloaded version of 'group'.
618 groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
619 groupBy k = B.groupBy (\a b -> k (w2c a) (w2c b))
621 -- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
622 -- char. Around 4 times faster than the generalised join.
624 joinWithChar :: Char -> ByteString -> ByteString -> ByteString
625 joinWithChar = B.joinWithByte . c2w
626 {-# INLINE joinWithChar #-}
628 -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
629 index :: ByteString -> Int -> Char
630 index = (w2c .) . B.index
633 -- | /O(n)/ The 'elemIndex' function returns the index of the first
634 -- element in the given 'ByteString' which is equal (by memchr) to the
635 -- query element, or 'Nothing' if there is no such element.
636 elemIndex :: Char -> ByteString -> Maybe Int
637 elemIndex = B.elemIndex . c2w
638 {-# INLINE elemIndex #-}
640 -- | /O(n)/ The 'elemIndexLast' function returns the last index of the
641 -- element in the given 'ByteString' which is equal to the query
642 -- element, or 'Nothing' if there is no such element. The following
645 -- > elemIndexLast c xs ==
646 -- > (-) (length xs - 1) `fmap` elemIndex c (reverse xs)
648 elemIndexLast :: Char -> ByteString -> Maybe Int
649 elemIndexLast = B.elemIndexLast . c2w
650 {-# INLINE elemIndexLast #-}
652 -- | /O(n)/ The 'elemIndices' function extends 'elemIndex', by returning
653 -- the indices of all elements equal to the query element, in ascending order.
654 elemIndices :: Char -> ByteString -> [Int]
655 elemIndices = B.elemIndices . c2w
656 {-# INLINE elemIndices #-}
658 -- | The 'findIndex' function takes a predicate and a 'ByteString' and
659 -- returns the index of the first element in the ByteString satisfying the predicate.
660 findIndex :: (Char -> Bool) -> ByteString -> Maybe Int
661 findIndex f = B.findIndex (f . w2c)
662 {-# INLINE findIndex #-}
664 -- | The 'findIndices' function extends 'findIndex', by returning the
665 -- indices of all elements satisfying the predicate, in ascending order.
666 findIndices :: (Char -> Bool) -> ByteString -> [Int]
667 findIndices f = B.findIndices (f . w2c)
669 -- | count returns the number of times its argument appears in the ByteString
671 -- > count = length . elemIndices
675 -- > count '\n' == length . lines
677 -- But more efficiently than using length on the intermediate list.
678 count :: Char -> ByteString -> Int
679 count c = B.count (c2w c)
681 -- | /O(n)/ 'elem' is the 'ByteString' membership predicate. This
682 -- implementation uses @memchr(3)@.
683 elem :: Char -> ByteString -> Bool
684 elem c = B.elem (c2w c)
687 -- | /O(n)/ 'notElem' is the inverse of 'elem'
688 notElem :: Char -> ByteString -> Bool
689 notElem c = B.notElem (c2w c)
690 {-# INLINE notElem #-}
692 -- | /O(n)/ 'filter', applied to a predicate and a ByteString,
693 -- returns a ByteString containing those characters that satisfy the
695 filter :: (Char -> Bool) -> ByteString -> ByteString
696 filter f = B.filter (f . w2c)
697 {-# INLINE filter #-}
699 -- | /O(n)/ The 'find' function takes a predicate and a ByteString,
700 -- and returns the first element in matching the predicate, or 'Nothing'
701 -- if there is no such element.
702 find :: (Char -> Bool) -> ByteString -> Maybe Char
703 find f ps = w2c `fmap` B.find (f . w2c) ps
706 -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common
707 -- case of filtering a single Char. It is more efficient to use
708 -- filterChar in this case.
710 -- > filterChar == filter . (==)
712 -- filterChar is around 10x faster, and uses much less space, than its
715 filterChar :: Char -> ByteString -> ByteString
716 filterChar c = B.filterByte (c2w c)
717 {-# INLINE filterChar #-}
719 -- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
720 -- case of filtering a single Char out of a list. It is more efficient
721 -- to use /filterNotChar/ in this case.
723 -- > filterNotChar == filter . (/=)
725 -- filterNotChar is around 3x faster, and uses much less space, than its
728 filterNotChar :: Char -> ByteString -> ByteString
729 filterNotChar c = B.filterNotByte (c2w c)
730 {-# INLINE filterNotChar #-}
732 -- | /O(n)/ 'zip' takes two ByteStrings and returns a list of
733 -- corresponding pairs of Chars. If one input ByteString is short,
734 -- excess elements of the longer ByteString are discarded. This is
735 -- equivalent to a pair of 'unpack' operations, and so space
736 -- usage may be large for multi-megabyte ByteStrings
737 zip :: ByteString -> ByteString -> [(Char,Char)]
739 | B.null ps || B.null qs = []
740 | otherwise = (unsafeHead ps, unsafeHead qs) : zip (B.unsafeTail ps) (B.unsafeTail qs)
742 -- | 'zipWith' generalises 'zip' by zipping with the function given as
743 -- the first argument, instead of a tupling function. For example,
744 -- @'zipWith' (+)@ is applied to two ByteStrings to produce the list
745 -- of corresponding sums.
746 zipWith :: (Char -> Char -> a) -> ByteString -> ByteString -> [a]
747 zipWith f = B.zipWith ((. w2c) . f . w2c)
749 -- | 'unzip' transforms a list of pairs of Chars into a pair of
750 -- ByteStrings. Note that this performs two 'pack' operations.
751 unzip :: [(Char,Char)] -> (ByteString,ByteString)
752 unzip ls = (pack (P.map fst ls), pack (P.map snd ls))
755 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits
756 -- the check for the empty case, which is good for performance, but
757 -- there is an obligation on the programmer to provide a proof that the
758 -- ByteString is non-empty.
759 unsafeHead :: ByteString -> Char
760 unsafeHead = w2c . B.unsafeHead
761 {-# INLINE unsafeHead #-}
763 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a Char.
764 -- This omits the bounds check, which means there is an accompanying
765 -- obligation on the programmer to ensure the bounds are checked in some
767 unsafeIndex :: ByteString -> Int -> Char
768 unsafeIndex = (w2c .) . B.unsafeIndex
769 {-# INLINE unsafeIndex #-}
771 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
773 #if !defined(__GLASGOW_HASKELL__)
774 w2c = chr . fromIntegral
776 w2c = unsafeChr . fromIntegral
780 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
781 -- silently truncates to 8 bits Chars > '\255'. It is provided as
782 -- convenience for ByteString construction.
784 c2w = fromIntegral . ord
787 -- ---------------------------------------------------------------------
788 -- Things that depend on the encoding
790 -- | 'breakSpace' returns the pair of ByteStrings when the argument is
791 -- broken at the first whitespace byte. I.e.
793 -- > break isSpace == breakSpace
795 breakSpace :: ByteString -> (ByteString,ByteString)
796 breakSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
797 i <- firstspace (p `plusPtr` s) 0 l
798 return $ case () of {_
799 | i == 0 -> (empty, PS x s l)
800 | i == l -> (PS x s l, empty)
801 | otherwise -> (PS x s i, PS x (s+i) (l-i))
803 {-# INLINE breakSpace #-}
805 firstspace :: Ptr Word8 -> Int -> Int -> IO Int
809 | otherwise = do w <- peekByteOff ptr n
810 if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n
812 -- | 'dropSpace' efficiently returns the 'ByteString' argument with
813 -- white space Chars removed from the front. It is more efficient than
814 -- calling dropWhile for removing whitespace. I.e.
816 -- > dropWhile isSpace == dropSpace
818 dropSpace :: ByteString -> ByteString
819 dropSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
820 i <- firstnonspace (p `plusPtr` s) 0 l
821 return $ if i == l then empty else PS x (s+i) (l-i)
822 {-# INLINE dropSpace #-}
824 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
825 STRICT3(firstnonspace)
826 firstnonspace ptr n m
828 | otherwise = do w <- peekElemOff ptr n
829 if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n
831 -- | 'dropSpaceEnd' efficiently returns the 'ByteString' argument with
832 -- white space removed from the end. I.e.
834 -- > reverse . (dropWhile isSpace) . reverse == dropSpaceEnd
836 -- but it is more efficient than using multiple reverses.
838 dropSpaceEnd :: ByteString -> ByteString
839 dropSpaceEnd (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
840 i <- lastnonspace (p `plusPtr` s) (l-1)
841 return $ if i == (-1) then empty else PS x s (i+1)
842 {-# INLINE dropSpaceEnd #-}
844 lastnonspace :: Ptr Word8 -> Int -> IO Int
845 STRICT2(lastnonspace)
848 | otherwise = do w <- peekElemOff ptr n
849 if isSpaceWord8 w then lastnonspace ptr (n-1) else return n
851 -- | 'lines' breaks a ByteString up into a list of ByteStrings at
852 -- newline Chars. The resulting strings do not contain newlines.
854 lines :: ByteString -> [ByteString]
857 | otherwise = case search ps of
859 Just n -> take n ps : lines (drop (n+1) ps)
860 where search = elemIndex '\n'
866 P.length . lines = count '\n'
871 -- Just as fast, but more complex. Should be much faster, I thought.
872 lines :: ByteString -> [ByteString]
873 lines (PS _ _ 0) = []
874 lines (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
875 let ptr = p `plusPtr` s
879 let q = memchr (ptr `plusPtr` n) 0x0a (fromIntegral (l-n))
881 then return [PS x (s+n) (l-n)]
882 else do let i = q `minusPtr` ptr
884 return $! PS x (s+n) (i-n) : ls
888 -- | 'unlines' is an inverse operation to 'lines'. It joins lines,
889 -- after appending a terminating newline to each.
890 unlines :: [ByteString] -> ByteString
892 unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space
893 where nl = packChar '\n'
895 -- | 'words' breaks a ByteString up into a list of words, which
896 -- were delimited by Chars representing white space. And
898 -- > tokens isSpace = words
900 words :: ByteString -> [ByteString]
901 words = B.tokens isSpaceWord8
904 -- | The 'unwords' function is analogous to the 'unlines' function, on words.
905 unwords :: [ByteString] -> ByteString
906 unwords = join (packChar ' ')
907 {-# INLINE unwords #-}
909 -- | /O(n)/ Indicies of newlines. Shorthand for
911 -- > elemIndices '\n'
913 lineIndices :: ByteString -> [Int]
914 lineIndices = elemIndices '\n'
915 {-# INLINE lineIndices #-}
917 -- | 'lines\'' behaves like 'lines', in that it breaks a ByteString on
918 -- newline Chars. However, unlike the Prelude functions, 'lines\'' and
919 -- 'unlines\'' correctly reconstruct lines that are missing terminating
920 -- newlines characters. I.e.
922 -- > unlines (lines "a\nb\nc") == "a\nb\nc\n"
923 -- > unlines' (lines' "a\nb\nc") == "a\nb\nc"
925 -- Note that this means:
927 -- > lines "a\nb\nc\n" == ["a","b","c"]
928 -- > lines' "a\nb\nc\n" == ["a","b","c",""]
930 lines' :: ByteString -> [ByteString]
931 lines' ps = ps `seq` case elemIndex '\n' ps of
933 Just n -> take n ps : lines' (drop (n+1) ps)
935 -- | 'linesCRLF\'' behaves like 'lines\'', but breaks on (\\cr?\\lf)
936 linesCRLF' :: ByteString -> [ByteString]
937 linesCRLF' ps = ps `seq` case elemIndex '\n' ps of
939 Just 0 -> empty : linesCRLF' (drop 1 ps)
940 Just n -> let k = if ps `unsafeIndex` (n-1) == '\r' then n-1 else n
941 in take k ps : linesCRLF' (drop (n+1) ps)
943 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
944 -- retores lines that do not have terminating newlines (see the
945 -- description for 'lines\'').
947 unlines' :: [ByteString] -> ByteString
948 unlines' ss = concat $ intersperse_newlines ss
949 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
950 intersperse_newlines s = s
951 newline = packChar '\n'
953 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
954 -- retores lines that do not have terminating newlines (see the
955 -- description for 'lines\''). Uses CRLF instead of LF.
957 unlinesCRLF' :: [ByteString] -> ByteString
958 unlinesCRLF' ss = concat $ intersperse_newlines ss
959 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
960 intersperse_newlines s = s
961 newline = pack "\r\n"
963 -- | 'words\'' behaves like 'words', with the exception that it produces
964 -- output on ByteStrings with trailing whitespace that can be
965 -- correctly inverted by 'unwords'. I.e.
967 -- > words "a b c " == ["a","b","c"]
968 -- > words' "a b c " == ["a","b","c",""]
970 -- > unwords $ words "a b c " == "a b c"
971 -- > unwords $ words' "a b c " == "a b c "
973 words' :: ByteString -> [ByteString]
974 words' = B.splitWith isSpaceWord8
976 -- | 'unwords\'' behaves like 'unwords'. It is provided for consistency
977 -- with the other invertable words and lines functions.
978 unwords' :: [ByteString] -> ByteString
981 -- | 'betweenLines' returns the ByteString between the two lines given,
982 -- or Nothing if they do not appear. The returned string is the first
983 -- and shortest string such that the line before it is the given first
984 -- line, and the line after it is the given second line.
985 betweenLines :: ByteString -- ^ First line to look for
986 -> ByteString -- ^ Second line to look for
987 -> ByteString -- ^ 'ByteString' to look in
988 -> Maybe (ByteString)
990 betweenLines start end ps =
991 case P.break (start ==) (lines ps) of
992 (_, _:rest@(PS ps1 s1 _:_)) ->
993 case P.break (end ==) rest of
994 (_, PS _ s2 _:_) -> Just $ PS ps1 s1 (s2 - s1)
998 -- ---------------------------------------------------------------------
999 -- Reading from ByteStrings
1001 -- | readInt skips any whitespace at the beginning of its argument, and
1002 -- reads an Int from the beginning of the ByteString. If there is no
1003 -- integer at the beginning of the string, it returns Nothing, otherwise
1004 -- it just returns the int read, and the rest of the string.
1005 readInt :: ByteString -> Maybe (Int, ByteString)
1006 readInt p@(PS x s l) = inlinePerformIO $ useAsCString p $ \cstr ->
1007 with (castPtr cstr) $ \endpp -> do
1008 val <- c_strtol (castPtr cstr) endpp 0
1009 skipped <- (`minusPtr` cstr) `fmap` peek endpp
1010 return $ if skipped == 0
1012 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
1014 -- | unsafeReadInt is like readInt, but requires a null terminated
1015 -- ByteString. It avoids a copy if this is the case. It returns the Int
1016 -- read, if any, and the rest of the string.
1017 unsafeReadInt :: ByteString -> Maybe (Int, ByteString)
1018 unsafeReadInt p@(PS x s l) = inlinePerformIO $ unsafeUseAsCString p $ \cstr ->
1019 with (castPtr cstr) $ \endpp -> do
1020 val <- c_strtol (castPtr cstr) endpp 0
1021 skipped <- (`minusPtr` cstr) `fmap` peek endpp
1022 return $ if skipped == 0
1024 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
1026 foreign import ccall unsafe "stdlib.h strtol" c_strtol
1027 :: Ptr Word8 -> Ptr (Ptr Word8) -> Int -> IO CLong
1031 -- not quite there yet
1033 readInt :: ByteString -> Maybe (Int, ByteString)
1038 | B.null ps = Nothing
1039 | x == '-' = neg 0 xs
1040 | otherwise = pos (parse x) xs
1041 where (x, xs) = (ps `unsafeIndex` 0, unsafeTail ps)
1044 neg n qs | isSpace x = return $ Just ((i-n),xs)
1045 | otherwise = neg (parse x + (10 * n)) xs
1046 where (x, xs) = (qs `unsafeIndex` 0, unsafeTail qs)
1049 pos n qs | isSpace x = go (i+n) xs
1050 | otherwise = pos (parse x + (10 * n)) xs
1051 where (x, xs) = (qs `unsafeIndexWord8` 0, unsafeTail qs)
1053 parse w = fromIntegral (w - 48) :: Int
1054 {-# INLINE parse #-}
1057 -- ---------------------------------------------------------------------
1060 -- Just like inlinePerformIO, but we inline it. Big performance gains as
1061 -- it exposes lots of things to further inlining
1063 {-# INLINE inlinePerformIO #-}
1064 inlinePerformIO :: IO a -> a
1065 #if defined(__GLASGOW_HASKELL__)
1066 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
1068 inlinePerformIO = unsafePerformIO
1071 -- Selects white-space characters in the Latin-1 range
1072 -- ordered by frequency
1074 isSpaceWord8 :: Word8 -> Bool
1075 isSpaceWord8 w = case w of
1076 0x20 -> True -- SPACE
1077 0x0A -> True -- LF, \n
1078 0x09 -> True -- HT, \t
1079 0x0C -> True -- FF, \f
1080 0x0D -> True -- CR, \r
1081 0x0B -> True -- VT, \v
1082 0xA0 -> True -- spotted by QC..
1084 {-# INLINE isSpaceWord8 #-}
1086 -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
1087 -- slightly faster for one-shot cases.
1088 map' :: (Char -> Char) -> ByteString -> ByteString
1089 map' f = B.map' (c2w . f . w2c)
1091 -- | /O(n)/ 'filter\'' is a non-fuseable version of filter, that may be
1092 -- around 2x faster for some one-shot applications.
1093 filter' :: (Char -> Bool) -> ByteString -> ByteString
1094 filter' f = B.filter' (f . w2c)