1 {-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
3 -- Module : Data.ByteString.Char8
4 -- Copyright : (c) Don Stewart 2006
7 -- Maintainer : dons@cse.unsw.edu.au
8 -- Stability : experimental
9 -- Portability : portable (tested with GHC>=6.4.1 and Hugs 2005)
13 -- | Manipulate ByteStrings using Char operations. All Chars will be
14 -- truncated to 8 bits. It can be expected that these functions will run
15 -- at identical speeds to their Word8 equivalents in @Data.ByteString@.
17 -- More specifically these byte strings are taken to be in the
18 -- subset of Unicode covered by code points 0-255. This covers
19 -- Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls.
23 -- * <http://www.unicode.org/charts/>
25 -- * <http://www.unicode.org/charts/PDF/U0000.pdf>
27 -- * <http://www.unicode.org/charts/PDF/U0080.pdf>
29 -- This module is intended to be imported @qualified@, to avoid name
30 -- clashes with Prelude functions. eg.
32 -- > import qualified Data.ByteString.Char8 as B
35 module Data.ByteString.Char8 (
37 -- * The @ByteString@ type
38 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
40 -- * Introducing and eliminating 'ByteString's
41 empty, -- :: ByteString
42 packChar, -- :: Char -> ByteString
43 pack, -- :: String -> ByteString
44 unpack, -- :: ByteString -> String
47 cons, -- :: Char -> ByteString -> ByteString
48 snoc, -- :: Char -> ByteString -> ByteString
49 null, -- :: ByteString -> Bool
50 length, -- :: ByteString -> Int
51 head, -- :: ByteString -> Char
52 tail, -- :: ByteString -> ByteString
53 last, -- :: ByteString -> Char
54 init, -- :: ByteString -> ByteString
55 append, -- :: ByteString -> ByteString -> ByteString
57 -- * Special ByteStrings
58 inits, -- :: ByteString -> [ByteString]
59 tails, -- :: ByteString -> [ByteString]
60 elems, -- :: ByteString -> [ByteString]
62 -- * Transformating ByteStrings
63 map, -- :: (Char -> Char) -> ByteString -> ByteString
64 reverse, -- :: ByteString -> ByteString
65 intersperse, -- :: Char -> ByteString -> ByteString
66 transpose, -- :: [ByteString] -> [ByteString]
68 -- * Reducing 'ByteString's
69 foldl, -- :: (a -> Char -> a) -> a -> ByteString -> a
70 foldr, -- :: (Char -> a -> a) -> a -> ByteString -> a
71 foldl1, -- :: (Char -> Char -> Char) -> ByteString -> Char
72 foldr1, -- :: (Char -> Char -> Char) -> ByteString -> Char
73 foldl', -- :: (a -> Char -> a) -> a -> ByteString -> a
76 concat, -- :: [ByteString] -> ByteString
77 concatMap, -- :: (Char -> ByteString) -> ByteString -> ByteString
78 any, -- :: (Char -> Bool) -> ByteString -> Bool
79 all, -- :: (Char -> Bool) -> ByteString -> Bool
80 maximum, -- :: ByteString -> Char
81 minimum, -- :: ByteString -> Char
82 mapIndexed, -- :: (Int -> Char -> Char) -> ByteString -> ByteString
84 -- * Generating and unfolding ByteStrings
85 replicate, -- :: Int -> Char -> ByteString
86 unfoldrN, -- :: (a -> Maybe (Char, a)) -> a -> ByteString
90 -- ** Breaking strings
91 take, -- :: Int -> ByteString -> ByteString
92 drop, -- :: Int -> ByteString -> ByteString
93 splitAt, -- :: Int -> ByteString -> (ByteString, ByteString)
94 takeWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
95 dropWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
96 break, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
97 span, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
98 spanEnd, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
100 -- ** Breaking and dropping on specific Chars
101 breakChar, -- :: Char -> ByteString -> (ByteString, ByteString)
102 spanChar, -- :: Char -> ByteString -> (ByteString, ByteString)
103 breakFirst, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
104 breakLast, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
105 breakSpace, -- :: ByteString -> Maybe (ByteString,ByteString)
106 dropSpace, -- :: ByteString -> ByteString
107 dropSpaceEnd, -- :: ByteString -> ByteString
109 -- ** Breaking into many substrings
110 split, -- :: Char -> ByteString -> [ByteString]
111 splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString]
112 tokens, -- :: (Char -> Bool) -> ByteString -> [ByteString]
113 group, -- :: ByteString -> [ByteString]
114 groupBy, -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
116 -- ** Breaking into lines and words
117 lines, -- :: ByteString -> [ByteString]
118 words, -- :: ByteString -> [ByteString]
119 unlines, -- :: [ByteString] -> ByteString
120 unwords, -- :: ByteString -> [ByteString]
122 lines', -- :: ByteString -> [ByteString]
123 unlines', -- :: [ByteString] -> ByteString
124 linesCRLF', -- :: ByteString -> [ByteString]
125 unlinesCRLF', -- :: [ByteString] -> ByteString
126 words', -- :: ByteString -> [ByteString]
127 unwords', -- :: ByteString -> [ByteString]
129 lineIndices, -- :: ByteString -> [Int]
130 betweenLines, -- :: ByteString -> ByteString -> ByteString -> Maybe (ByteString)
132 -- ** Joining strings
133 join, -- :: ByteString -> [ByteString] -> ByteString
134 joinWithChar, -- :: Char -> ByteString -> ByteString -> ByteString
136 -- * Indexing ByteStrings
137 index, -- :: ByteString -> Int -> Char
138 elemIndex, -- :: Char -> ByteString -> Maybe Int
139 elemIndexLast, -- :: Char -> ByteString -> Maybe Int
140 elemIndices, -- :: Char -> ByteString -> [Int]
141 findIndex, -- :: (Char -> Bool) -> ByteString -> Maybe Int
142 findIndices, -- :: (Char -> Bool) -> ByteString -> [Int]
143 count, -- :: Char -> ByteString -> Int
145 -- * Ordered ByteStrings
146 sort, -- :: ByteString -> ByteString
148 -- * Searching ByteStrings
150 -- ** Searching by equality
151 elem, -- :: Char -> ByteString -> Bool
152 notElem, -- :: Char -> ByteString -> Bool
153 filterChar, -- :: Char -> ByteString -> ByteString
154 filterNotChar, -- :: Char -> ByteString -> ByteString
156 -- ** Searching with a predicate
157 filter, -- :: (Char -> Bool) -> ByteString -> ByteString
158 find, -- :: (Char -> Bool) -> ByteString -> Maybe Char
160 -- ** Searching for substrings
161 isPrefixOf, -- :: ByteString -> ByteString -> Bool
162 isSuffixOf, -- :: ByteString -> ByteString -> Bool
163 isSubstringOf, -- :: ByteString -> ByteString -> Bool
164 findSubstring, -- :: ByteString -> ByteString -> Maybe Int
165 findSubstrings, -- :: ByteString -> ByteString -> [Int]
167 -- * Zipping and unzipping ByteString
168 zip, -- :: ByteString -> ByteString -> [(Char,Char)]
169 zipWith, -- :: (Char -> Char -> c) -> ByteString -> ByteString -> [c]
170 unzip, -- :: [(Char,Char)] -> (ByteString,ByteString)
172 -- * Unchecked access
173 unsafeHead, -- :: ByteString -> Char
174 unsafeTail, -- :: ByteString -> ByteString
175 unsafeIndex, -- :: ByteString -> Int -> Char
176 w2c, -- :: Word8 -> Char
177 c2w, -- :: Char -> Word8
179 -- * Reading from ByteStrings
180 readInt, -- :: ByteString -> Maybe Int
181 unsafeReadInt, -- :: ByteString -> Maybe Int
183 -- * Copying ByteStrings
184 copy, -- :: ByteString -> ByteString
186 -- * I\/O with @ByteString@s
188 -- ** Standard input and output
190 #if defined(__GLASGOW_HASKELL__)
191 getLine, -- :: IO ByteString
193 getContents, -- :: IO ByteString
194 putStr, -- :: ByteString -> IO ()
195 putStrLn, -- :: ByteString -> IO ()
198 readFile, -- :: FilePath -> IO ByteString
199 -- mmapFile, -- :: FilePath -> IO ByteString
200 writeFile, -- :: FilePath -> ByteString -> IO ()
202 -- ** I\/O with Handles
203 #if defined(__GLASGOW_HASKELL__)
204 getArgs, -- :: IO [ByteString]
205 hGetLine, -- :: Handle -> IO ByteString
206 hGetNonBlocking, -- :: Handle -> Int -> IO ByteString
208 hGetContents, -- :: Handle -> IO ByteString
209 hGet, -- :: Handle -> Int -> IO ByteString
210 hPut, -- :: Handle -> ByteString -> IO ()
212 #if defined(__GLASGOW_HASKELL__)
213 -- * Low level construction
214 -- | For constructors from foreign language types see /Data.ByteString/
215 packAddress, -- :: Addr# -> ByteString
216 unsafePackAddress, -- :: Int -> Addr# -> ByteString
219 -- * Utilities (needed for array fusion)
220 #if defined(__GLASGOW_HASKELL__)
223 noAL, NoAL, loopArr, loopAcc, loopSndAcc,
224 loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL,
229 import qualified Prelude as P
230 import Prelude hiding (reverse,head,tail,last,init,null
231 ,length,map,lines,foldl,foldr,unlines
232 ,concat,any,take,drop,splitAt,takeWhile
233 ,dropWhile,span,break,elem,filter,unwords
234 ,words,maximum,minimum,all,concatMap
235 ,foldl1,foldr1,readFile,writeFile,replicate
236 ,getContents,getLine,putStr,putStrLn
237 ,zip,zipWith,unzip,notElem)
239 import qualified Data.ByteString as B
241 -- Listy functions transparently exported
242 import Data.ByteString (ByteString(..)
243 ,empty,null,length,tail,init,append
244 ,inits,tails,elems,reverse,transpose
245 ,concat,take,drop,splitAt,join
246 ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
247 ,findSubstrings,unsafeTail,copy,group
249 ,getContents, putStr, putStrLn
250 ,readFile, {-mmapFile,-} writeFile
251 ,hGetContents, hGet, hPut
252 #if defined(__GLASGOW_HASKELL__)
253 ,getLine, getArgs, hGetLine, hGetNonBlocking
254 ,packAddress, unsafePackAddress
257 ,noAL, NoAL, loopArr, loopAcc, loopSndAcc
258 ,loopU, mapEFL, filterEFL, foldEFL, foldEFL', fuseEFL
259 ,useAsCString, unsafeUseAsCString
264 import qualified Data.List as List (intersperse)
267 import Foreign.C.Types (CLong)
268 import Foreign.Marshal.Utils (with)
270 #if defined(__GLASGOW_HASKELL__)
271 import GHC.Base (Char(..),unsafeChr,unpackCString#,unsafeCoerce#)
272 import GHC.IOBase (IO(..),stToIO)
273 import GHC.Prim (Addr#,writeWord8OffAddr#,realWorld#,plusAddr#)
274 import GHC.Ptr (Ptr(..))
275 import GHC.ST (ST(..))
278 #define STRICT1(f) f a | a `seq` False = undefined
279 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
280 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
282 ------------------------------------------------------------------------
284 -- | /O(1)/ Convert a 'Char' into a 'ByteString'
285 packChar :: Char -> ByteString
286 packChar = B.packByte . c2w
287 {-# INLINE packChar #-}
289 -- | /O(n)/ Convert a 'String' into a 'ByteString'
291 -- For applications with large numbers of string literals, pack can be a
292 -- bottleneck. In such cases, consider using packAddress (GHC only).
293 pack :: String -> ByteString
294 #if !defined(__GLASGOW_HASKELL__)
296 pack str = B.create (P.length str) $ \p -> go p str
297 where go _ [] = return ()
298 go p (x:xs) = poke p (c2w x) >> go (p `plusPtr` 1) xs
300 #else /* hack away */
302 pack str = B.create (P.length str) $ \(Ptr p) -> stToIO (go p str)
304 go :: Addr# -> [Char] -> ST a ()
306 go p (C# c:cs) = writeByte p (unsafeCoerce# c) >> go (p `plusAddr#` 1#) cs
308 writeByte p c = ST $ \s# ->
309 case writeWord8OffAddr# p 0# c s# of s2# -> (# s2#, () #)
310 {-# INLINE writeByte #-}
313 "pack/packAddress" forall s# .
314 pack (unpackCString# s#) = B.packAddress s#
321 -- | /O(n)/ Converts a 'ByteString' to a 'String'.
322 unpack :: ByteString -> [Char]
323 unpack = B.unpackWith w2c
324 {-# INLINE unpack #-}
326 -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different
327 -- complexity, as it requires a memcpy.
328 cons :: Char -> ByteString -> ByteString
332 -- | /O(n)/ Append a Char to the end of a 'ByteString'. Similar to
333 -- 'cons', this function performs a memcpy.
334 snoc :: ByteString -> Char -> ByteString
335 snoc p = B.snoc p . c2w
338 -- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
339 head :: ByteString -> Char
343 -- | /O(1)/ Extract the last element of a packed string, which must be non-empty.
344 last :: ByteString -> Char
348 -- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each element of @xs@
349 map :: (Char -> Char) -> ByteString -> ByteString
350 map f = B.map (c2w . f . w2c)
353 -- | /O(n)/ The 'intersperse' function takes a Char and a 'ByteString'
354 -- and \`intersperses\' that Char between the elements of the
355 -- 'ByteString'. It is analogous to the intersperse function on Lists.
356 intersperse :: Char -> ByteString -> ByteString
357 intersperse = B.intersperse . c2w
358 {-# INLINE intersperse #-}
360 -- | 'foldl', applied to a binary operator, a starting value (typically
361 -- the left-identity of the operator), and a ByteString, reduces the
362 -- ByteString using the binary operator, from left to right.
363 foldl :: (a -> Char -> a) -> a -> ByteString -> a
364 foldl f = B.foldl (\a c -> f a (w2c c))
367 -- | 'foldl\'' is like foldl, but strict in the accumulator.
368 foldl' :: (a -> Char -> a) -> a -> ByteString -> a
369 foldl' f = B.foldl' (\a c -> f a (w2c c))
370 {-# INLINE foldl' #-}
372 -- | 'foldr', applied to a binary operator, a starting value
373 -- (typically the right-identity of the operator), and a packed string,
374 -- reduces the packed string using the binary operator, from right to left.
375 foldr :: (Char -> a -> a) -> a -> ByteString -> a
376 foldr f = B.foldr (\c a -> f (w2c c) a)
379 -- | 'foldl1' is a variant of 'foldl' that has no starting value
380 -- argument, and thus must be applied to non-empty 'ByteStrings'.
381 foldl1 :: (Char -> Char -> Char) -> ByteString -> Char
382 foldl1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
383 {-# INLINE foldl1 #-}
385 -- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
386 -- and thus must be applied to non-empty 'ByteString's
387 foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
388 foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
389 {-# INLINE foldr1 #-}
391 -- | Map a function over a 'ByteString' and concatenate the results
392 concatMap :: (Char -> ByteString) -> ByteString -> ByteString
393 concatMap f = B.concatMap (f . w2c)
394 {-# INLINE concatMap #-}
396 -- | Applied to a predicate and a ByteString, 'any' determines if
397 -- any element of the 'ByteString' satisfies the predicate.
398 any :: (Char -> Bool) -> ByteString -> Bool
399 any f = B.any (f . w2c)
402 -- | Applied to a predicate and a 'ByteString', 'all' determines if
403 -- all elements of the 'ByteString' satisfy the predicate.
404 all :: (Char -> Bool) -> ByteString -> Bool
405 all f = B.all (f . w2c)
408 -- | 'maximum' returns the maximum value from a 'ByteString'
409 maximum :: ByteString -> Char
410 maximum = w2c . B.maximum
411 {-# INLINE maximum #-}
413 -- | 'minimum' returns the minimum value from a 'ByteString'
414 minimum :: ByteString -> Char
415 minimum = w2c . B.minimum
416 {-# INLINE minimum #-}
418 -- | /O(n)/ map Char functions, provided with the index at each position
419 mapIndexed :: (Int -> Char -> Char) -> ByteString -> ByteString
420 mapIndexed f = B.mapIndexed (\i c -> c2w (f i (w2c c)))
421 {-# INLINE mapIndexed #-}
423 -- | /O(n)/ 'replicate' @n x@ is a ByteString of length @n@ with @x@
424 -- the value of every element. The following holds:
426 -- > replicate w c = unfoldr w (\u -> Just (u,u)) c
428 -- This implemenation uses @memset(3)@
429 replicate :: Int -> Char -> ByteString
430 replicate w = B.replicate w . c2w
431 {-# INLINE replicate #-}
433 -- | /O(n)/ The 'unfoldrN' function is analogous to the List \'unfoldr\'.
434 -- 'unfoldrN' builds a ByteString from a seed value. The function takes
435 -- the element and returns 'Nothing' if it is done producing the
436 -- ByteString or returns 'Just' @(a,b)@, in which case, @a@ is a
437 -- prepending to the ByteString and @b@ is used as the next element in a
440 -- To preven unfoldrN having /O(n^2)/ complexity (as prepending a
441 -- character to a ByteString is /O(n)/, this unfoldr requires a maximum
442 -- final size of the ByteString as an argument. 'cons' can then be
443 -- implemented in /O(1)/ (i.e. a 'poke'), and the unfoldr itself has
444 -- linear complexity. The depth of the recursion is limited to this
445 -- size, but may be less. For lazy, infinite unfoldr, use
446 -- 'Data.List.unfoldr' (from 'Data.List').
450 -- > unfoldrN 10 (\x -> Just (x, chr (ord x + 1))) '0' == "0123456789"
452 -- The following equation connects the depth-limited unfoldr to the List unfoldr:
454 -- > unfoldrN n == take n $ List.unfoldr
456 unfoldrN :: Int -> (a -> Maybe (Char, a)) -> a -> ByteString
457 unfoldrN n f w = B.unfoldrN n ((k `fmap`) . f) w
458 where k (i,j) = (c2w i, j)
459 {-# INLINE unfoldrN #-}
461 -- | 'takeWhile', applied to a predicate @p@ and a ByteString @xs@,
462 -- returns the longest prefix (possibly empty) of @xs@ of elements that
464 takeWhile :: (Char -> Bool) -> ByteString -> ByteString
465 takeWhile f = B.takeWhile (f . w2c)
466 {-# INLINE takeWhile #-}
468 -- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
469 dropWhile :: (Char -> Bool) -> ByteString -> ByteString
470 dropWhile f = B.dropWhile (f . w2c)
471 {-# INLINE dropWhile #-}
473 -- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
474 break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
475 break f = B.break (f . w2c)
478 -- | 'span' @p xs@ breaks the ByteString into two segments. It is
479 -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
480 span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
481 span f = B.span (f . w2c)
484 -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
487 -- > spanEnd (not.isSpace) "x y z" == ("x y ","z")
491 -- > spanEnd (not . isSpace) ps
493 -- > let (x,y) = span (not.isSpace) (reverse ps) in (reverse y, reverse x)
495 spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
496 spanEnd f = B.spanEnd (f . w2c)
497 {-# INLINE spanEnd #-}
499 -- | 'breakChar' breaks its ByteString argument at the first occurence
500 -- of the specified Char. It is more efficient than 'break' as it is
501 -- implemented with @memchr(3)@. I.e.
503 -- > break (=='c') "abcd" == breakChar 'c' "abcd"
505 breakChar :: Char -> ByteString -> (ByteString, ByteString)
506 breakChar = B.breakByte . c2w
507 {-# INLINE breakChar #-}
509 -- | 'spanChar' breaks its ByteString argument at the first
510 -- occurence of a Char other than its argument. It is more efficient
513 -- > span (=='c') "abcd" == spanByte 'c' "abcd"
515 spanChar :: Char -> ByteString -> (ByteString, ByteString)
516 spanChar = B.spanByte . c2w
517 {-# INLINE spanChar #-}
519 -- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
520 -- occurence of @w@. It behaves like 'break', except the delimiter is
521 -- not returned, and @Nothing@ is returned if the delimiter is not in
522 -- the ByteString. I.e.
524 -- > breakFirst 'b' "aabbcc" == Just ("aa","bcc")
526 -- > breakFirst c xs ==
527 -- > let (x,y) = break (== c) xs
528 -- > in if null y then Nothing else Just (x, drop 1 y))
530 breakFirst :: Char -> ByteString -> Maybe (ByteString,ByteString)
531 breakFirst = B.breakFirst . c2w
532 {-# INLINE breakFirst #-}
534 -- | /O(n)/ 'breakLast' behaves like breakFirst, but from the end of the
537 -- > breakLast ('b') (pack "aabbcc") == Just ("aab","cc")
539 -- and the following are equivalent:
541 -- > breakLast 'c' "abcdef"
542 -- > let (x,y) = break (=='c') (reverse "abcdef")
543 -- > in if null x then Nothing else Just (reverse (drop 1 y), reverse x)
545 breakLast :: Char -> ByteString -> Maybe (ByteString,ByteString)
546 breakLast = B.breakLast . c2w
547 {-# INLINE breakLast #-}
549 -- | /O(n)/ Break a 'ByteString' into pieces separated by the byte
550 -- argument, consuming the delimiter. I.e.
552 -- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
553 -- > split 'a' "aXaXaXa" == ["","X","X","X"]
554 -- > split 'x' "x" == ["",""]
558 -- > join [c] . split c == id
559 -- > split == splitWith . (==)
561 -- As for all splitting functions in this library, this function does
562 -- not copy the substrings, it just constructs new 'ByteStrings' that
563 -- are slices of the original.
565 split :: Char -> ByteString -> [ByteString]
566 split = B.split . c2w
569 -- | /O(n)/ Splits a 'ByteString' into components delimited by
570 -- separators, where the predicate returns True for a separator element.
571 -- The resulting components do not contain the separators. Two adjacent
572 -- separators result in an empty component in the output. eg.
574 -- > splitWith (=='a') "aabbaca" == ["","","bb","c",""]
576 splitWith :: (Char -> Bool) -> ByteString -> [ByteString]
577 splitWith f = B.splitWith (f . w2c)
578 {-# INLINE splitWith #-}
579 -- the inline makes a big difference here.
581 -- | Like 'splitWith', except that sequences of adjacent separators are
582 -- treated as a single separator. eg.
584 -- > tokens (=='a') "aabbaca" == ["bb","c"]
586 tokens :: (Char -> Bool) -> ByteString -> [ByteString]
587 tokens f = B.tokens (f . w2c)
588 {-# INLINE tokens #-}
590 -- | The 'groupBy' function is the non-overloaded version of 'group'.
591 groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
592 groupBy k = B.groupBy (\a b -> k (w2c a) (w2c b))
594 -- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
595 -- char. Around 4 times faster than the generalised join.
597 joinWithChar :: Char -> ByteString -> ByteString -> ByteString
598 joinWithChar = B.joinWithByte . c2w
599 {-# INLINE joinWithChar #-}
601 -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
602 index :: ByteString -> Int -> Char
603 index = (w2c .) . B.index
606 -- | /O(n)/ The 'elemIndex' function returns the index of the first
607 -- element in the given 'ByteString' which is equal (by memchr) to the
608 -- query element, or 'Nothing' if there is no such element.
609 elemIndex :: Char -> ByteString -> Maybe Int
610 elemIndex = B.elemIndex . c2w
611 {-# INLINE elemIndex #-}
613 -- | /O(n)/ The 'elemIndexLast' function returns the last index of the
614 -- element in the given 'ByteString' which is equal to the query
615 -- element, or 'Nothing' if there is no such element. The following
618 -- > elemIndexLast c xs ==
619 -- > (-) (length xs - 1) `fmap` elemIndex c (reverse xs)
621 elemIndexLast :: Char -> ByteString -> Maybe Int
622 elemIndexLast = B.elemIndexLast . c2w
623 {-# INLINE elemIndexLast #-}
625 -- | /O(n)/ The 'elemIndices' function extends 'elemIndex', by returning
626 -- the indices of all elements equal to the query element, in ascending order.
627 elemIndices :: Char -> ByteString -> [Int]
628 elemIndices = B.elemIndices . c2w
629 {-# INLINE elemIndices #-}
631 -- | The 'findIndex' function takes a predicate and a 'ByteString' and
632 -- returns the index of the first element in the ByteString satisfying the predicate.
633 findIndex :: (Char -> Bool) -> ByteString -> Maybe Int
634 findIndex f = B.findIndex (f . w2c)
635 {-# INLINE findIndex #-}
637 -- | The 'findIndices' function extends 'findIndex', by returning the
638 -- indices of all elements satisfying the predicate, in ascending order.
639 findIndices :: (Char -> Bool) -> ByteString -> [Int]
640 findIndices f = B.findIndices (f . w2c)
642 -- | count returns the number of times its argument appears in the ByteString
644 -- > count = length . elemIndices
648 -- > count '\n' == length . lines
650 -- But more efficiently than using length on the intermediate list.
651 count :: Char -> ByteString -> Int
652 count c = B.count (c2w c)
654 -- | /O(n)/ 'elem' is the 'ByteString' membership predicate. This
655 -- implementation uses @memchr(3)@.
656 elem :: Char -> ByteString -> Bool
657 elem c = B.elem (c2w c)
660 -- | /O(n)/ 'notElem' is the inverse of 'elem'
661 notElem :: Char -> ByteString -> Bool
662 notElem c = B.notElem (c2w c)
663 {-# INLINE notElem #-}
665 -- | /O(n)/ 'filter', applied to a predicate and a ByteString,
666 -- returns a ByteString containing those characters that satisfy the
668 filter :: (Char -> Bool) -> ByteString -> ByteString
669 filter f = B.filter (f . w2c)
670 {-# INLINE filter #-}
672 -- | /O(n)/ The 'find' function takes a predicate and a ByteString,
673 -- and returns the first element in matching the predicate, or 'Nothing'
674 -- if there is no such element.
675 find :: (Char -> Bool) -> ByteString -> Maybe Char
676 find f ps = w2c `fmap` B.find (f . w2c) ps
679 -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common
680 -- case of filtering a single Char. It is more efficient to use
681 -- filterChar in this case.
683 -- > filterChar == filter . (==)
685 -- filterChar is around 10x faster, and uses much less space, than its
688 filterChar :: Char -> ByteString -> ByteString
689 filterChar c = B.filterByte (c2w c)
690 {-# INLINE filterChar #-}
692 -- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
693 -- case of filtering a single Char out of a list. It is more efficient
694 -- to use /filterNotChar/ in this case.
696 -- > filterNotChar == filter . (/=)
698 -- filterNotChar is around 3x faster, and uses much less space, than its
701 filterNotChar :: Char -> ByteString -> ByteString
702 filterNotChar c = B.filterNotByte (c2w c)
703 {-# INLINE filterNotChar #-}
705 -- | /O(n)/ 'zip' takes two ByteStrings and returns a list of
706 -- corresponding pairs of Chars. If one input ByteString is short,
707 -- excess elements of the longer ByteString are discarded. This is
708 -- equivalent to a pair of 'unpack' operations, and so space
709 -- usage may be large for multi-megabyte ByteStrings
710 zip :: ByteString -> ByteString -> [(Char,Char)]
712 | B.null ps || B.null qs = []
713 | otherwise = (unsafeHead ps, unsafeHead qs) : zip (B.unsafeTail ps) (B.unsafeTail qs)
715 -- | 'zipWith' generalises 'zip' by zipping with the function given as
716 -- the first argument, instead of a tupling function. For example,
717 -- @'zipWith' (+)@ is applied to two ByteStrings to produce the list
718 -- of corresponding sums.
719 zipWith :: (Char -> Char -> a) -> ByteString -> ByteString -> [a]
720 zipWith f = B.zipWith ((. w2c) . f . w2c)
722 -- | 'unzip' transforms a list of pairs of Chars into a pair of
723 -- ByteStrings. Note that this performs two 'pack' operations.
724 unzip :: [(Char,Char)] -> (ByteString,ByteString)
725 unzip ls = (pack (P.map fst ls), pack (P.map snd ls))
728 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits
729 -- the check for the empty case, which is good for performance, but
730 -- there is an obligation on the programmer to provide a proof that the
731 -- ByteString is non-empty.
732 unsafeHead :: ByteString -> Char
733 unsafeHead = w2c . B.unsafeHead
734 {-# INLINE unsafeHead #-}
736 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a Char.
737 -- This omits the bounds check, which means there is an accompanying
738 -- obligation on the programmer to ensure the bounds are checked in some
740 unsafeIndex :: ByteString -> Int -> Char
741 unsafeIndex = (w2c .) . B.unsafeIndex
742 {-# INLINE unsafeIndex #-}
744 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
746 #if !defined(__GLASGOW_HASKELL__)
747 w2c = chr . fromIntegral
749 w2c = unsafeChr . fromIntegral
753 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
754 -- silently truncates to 8 bits Chars > '\255'. It is provided as
755 -- convenience for ByteString construction.
757 c2w = fromIntegral . ord
760 -- ---------------------------------------------------------------------
761 -- Things that depend on the encoding
763 -- | 'breakSpace' returns the pair of ByteStrings when the argument is
764 -- broken at the first whitespace byte. I.e.
766 -- > break isSpace == breakSpace
768 breakSpace :: ByteString -> (ByteString,ByteString)
769 breakSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
770 i <- firstspace (p `plusPtr` s) 0 l
771 return $ case () of {_
772 | i == 0 -> (empty, PS x s l)
773 | i == l -> (PS x s l, empty)
774 | otherwise -> (PS x s i, PS x (s+i) (l-i))
776 {-# INLINE breakSpace #-}
778 firstspace :: Ptr Word8 -> Int -> Int -> IO Int
782 | otherwise = do w <- peekByteOff ptr n
783 if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n
785 -- | 'dropSpace' efficiently returns the 'ByteString' argument with
786 -- white space Chars removed from the front. It is more efficient than
787 -- calling dropWhile for removing whitespace. I.e.
789 -- > dropWhile isSpace == dropSpace
791 dropSpace :: ByteString -> ByteString
792 dropSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
793 i <- firstnonspace (p `plusPtr` s) 0 l
794 return $ if i == l then empty else PS x (s+i) (l-i)
795 {-# INLINE dropSpace #-}
797 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
798 STRICT3(firstnonspace)
799 firstnonspace ptr n m
801 | otherwise = do w <- peekElemOff ptr n
802 if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n
804 -- | 'dropSpaceEnd' efficiently returns the 'ByteString' argument with
805 -- white space removed from the end. I.e.
807 -- > reverse . (dropWhile isSpace) . reverse == dropSpaceEnd
809 -- but it is more efficient than using multiple reverses.
811 dropSpaceEnd :: ByteString -> ByteString
812 dropSpaceEnd (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
813 i <- lastnonspace (p `plusPtr` s) (l-1)
814 return $ if i == (-1) then empty else PS x s (i+1)
815 {-# INLINE dropSpaceEnd #-}
817 lastnonspace :: Ptr Word8 -> Int -> IO Int
818 STRICT2(lastnonspace)
821 | otherwise = do w <- peekElemOff ptr n
822 if isSpaceWord8 w then lastnonspace ptr (n-1) else return n
824 -- | 'lines' breaks a ByteString up into a list of ByteStrings at
825 -- newline Chars. The resulting strings do not contain newlines.
827 lines :: ByteString -> [ByteString]
830 | otherwise = case search ps of
832 Just n -> take n ps : lines (drop (n+1) ps)
833 where search = elemIndex '\n'
839 P.length . lines = count '\n'
844 -- Just as fast, but more complex. Should be much faster, I thought.
845 lines :: ByteString -> [ByteString]
846 lines (PS _ _ 0) = []
847 lines (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
848 let ptr = p `plusPtr` s
852 let q = memchr (ptr `plusPtr` n) 0x0a (fromIntegral (l-n))
854 then return [PS x (s+n) (l-n)]
855 else do let i = q `minusPtr` ptr
857 return $! PS x (s+n) (i-n) : ls
861 -- | 'unlines' is an inverse operation to 'lines'. It joins lines,
862 -- after appending a terminating newline to each.
863 unlines :: [ByteString] -> ByteString
865 unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space
866 where nl = packChar '\n'
868 -- | 'words' breaks a ByteString up into a list of words, which
869 -- were delimited by Chars representing white space. And
871 -- > tokens isSpace = words
873 words :: ByteString -> [ByteString]
874 words = B.tokens isSpaceWord8
877 -- | The 'unwords' function is analogous to the 'unlines' function, on words.
878 unwords :: [ByteString] -> ByteString
879 unwords = join (packChar ' ')
880 {-# INLINE unwords #-}
882 -- | /O(n)/ Indicies of newlines. Shorthand for
884 -- > elemIndices '\n'
886 lineIndices :: ByteString -> [Int]
887 lineIndices = elemIndices '\n'
888 {-# INLINE lineIndices #-}
890 -- | 'lines\'' behaves like 'lines', in that it breaks a ByteString on
891 -- newline Chars. However, unlike the Prelude functions, 'lines\'' and
892 -- 'unlines\'' correctly reconstruct lines that are missing terminating
893 -- newlines characters. I.e.
895 -- > unlines (lines "a\nb\nc") == "a\nb\nc\n"
896 -- > unlines' (lines' "a\nb\nc") == "a\nb\nc"
898 -- Note that this means:
900 -- > lines "a\nb\nc\n" == ["a","b","c"]
901 -- > lines' "a\nb\nc\n" == ["a","b","c",""]
903 lines' :: ByteString -> [ByteString]
904 lines' ps = ps `seq` case elemIndex '\n' ps of
906 Just n -> take n ps : lines' (drop (n+1) ps)
908 -- | 'linesCRLF\'' behaves like 'lines\'', but breaks on (\\cr?\\lf)
909 linesCRLF' :: ByteString -> [ByteString]
910 linesCRLF' ps = ps `seq` case elemIndex '\n' ps of
912 Just 0 -> empty : linesCRLF' (drop 1 ps)
913 Just n -> let k = if ps `unsafeIndex` (n-1) == '\r' then n-1 else n
914 in take k ps : linesCRLF' (drop (n+1) ps)
916 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
917 -- retores lines that do not have terminating newlines (see the
918 -- description for 'lines\'').
920 unlines' :: [ByteString] -> ByteString
921 unlines' ss = concat $ intersperse_newlines ss
922 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
923 intersperse_newlines s = s
924 newline = packChar '\n'
926 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
927 -- retores lines that do not have terminating newlines (see the
928 -- description for 'lines\''). Uses CRLF instead of LF.
930 unlinesCRLF' :: [ByteString] -> ByteString
931 unlinesCRLF' ss = concat $ intersperse_newlines ss
932 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
933 intersperse_newlines s = s
934 newline = pack "\r\n"
936 -- | 'words\'' behaves like 'words', with the exception that it produces
937 -- output on ByteStrings with trailing whitespace that can be
938 -- correctly inverted by 'unwords'. I.e.
940 -- > words "a b c " == ["a","b","c"]
941 -- > words' "a b c " == ["a","b","c",""]
943 -- > unwords $ words "a b c " == "a b c"
944 -- > unwords $ words' "a b c " == "a b c "
946 words' :: ByteString -> [ByteString]
947 words' = B.splitWith isSpaceWord8
949 -- | 'unwords\'' behaves like 'unwords'. It is provided for consistency
950 -- with the other invertable words and lines functions.
951 unwords' :: [ByteString] -> ByteString
954 -- | 'betweenLines' returns the ByteString between the two lines given,
955 -- or Nothing if they do not appear. The returned string is the first
956 -- and shortest string such that the line before it is the given first
957 -- line, and the line after it is the given second line.
958 betweenLines :: ByteString -- ^ First line to look for
959 -> ByteString -- ^ Second line to look for
960 -> ByteString -- ^ 'ByteString' to look in
961 -> Maybe (ByteString)
963 betweenLines start end ps =
964 case P.break (start ==) (lines ps) of
965 (_, _:rest@(PS ps1 s1 _:_)) ->
966 case P.break (end ==) rest of
967 (_, PS _ s2 _:_) -> Just $ PS ps1 s1 (s2 - s1)
971 -- ---------------------------------------------------------------------
972 -- Reading from ByteStrings
974 -- | readInt skips any whitespace at the beginning of its argument, and
975 -- reads an Int from the beginning of the ByteString. If there is no
976 -- integer at the beginning of the string, it returns Nothing, otherwise
977 -- it just returns the int read, and the rest of the string.
978 readInt :: ByteString -> Maybe (Int, ByteString)
979 readInt p@(PS x s l) = inlinePerformIO $ useAsCString p $ \cstr ->
980 with (castPtr cstr) $ \endpp -> do
981 val <- c_strtol (castPtr cstr) endpp 0
982 skipped <- (`minusPtr` cstr) `fmap` peek endpp
983 return $ if skipped == 0
985 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
987 -- | unsafeReadInt is like readInt, but requires a null terminated
988 -- ByteString. It avoids a copy if this is the case. It returns the Int
989 -- read, if any, and the rest of the string.
990 unsafeReadInt :: ByteString -> Maybe (Int, ByteString)
991 unsafeReadInt p@(PS x s l) = inlinePerformIO $ unsafeUseAsCString p $ \cstr ->
992 with (castPtr cstr) $ \endpp -> do
993 val <- c_strtol (castPtr cstr) endpp 0
994 skipped <- (`minusPtr` cstr) `fmap` peek endpp
995 return $ if skipped == 0
997 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
999 foreign import ccall unsafe "stdlib.h strtol" c_strtol
1000 :: Ptr Word8 -> Ptr (Ptr Word8) -> Int -> IO CLong
1004 -- not quite there yet
1006 readInt :: ByteString -> Maybe (Int, ByteString)
1011 | B.null ps = Nothing
1012 | x == '-' = neg 0 xs
1013 | otherwise = pos (parse x) xs
1014 where (x, xs) = (ps `unsafeIndex` 0, unsafeTail ps)
1017 neg n qs | isSpace x = return $ Just ((i-n),xs)
1018 | otherwise = neg (parse x + (10 * n)) xs
1019 where (x, xs) = (qs `unsafeIndex` 0, unsafeTail qs)
1022 pos n qs | isSpace x = go (i+n) xs
1023 | otherwise = pos (parse x + (10 * n)) xs
1024 where (x, xs) = (qs `unsafeIndexWord8` 0, unsafeTail qs)
1026 parse w = fromIntegral (w - 48) :: Int
1027 {-# INLINE parse #-}
1030 -- ---------------------------------------------------------------------
1033 -- Just like inlinePerformIO, but we inline it. Big performance gains as
1034 -- it exposes lots of things to further inlining
1036 {-# INLINE inlinePerformIO #-}
1037 inlinePerformIO :: IO a -> a
1038 #if defined(__GLASGOW_HASKELL__)
1039 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
1041 inlinePerformIO = unsafePerformIO
1044 -- Selects white-space characters in the Latin-1 range
1045 -- ordered by frequency
1047 isSpaceWord8 :: Word8 -> Bool
1048 isSpaceWord8 w = case w of
1049 0x20 -> True -- SPACE
1050 0x0A -> True -- LF, \n
1051 0x09 -> True -- HT, \t
1052 0x0C -> True -- FF, \f
1053 0x0D -> True -- CR, \r
1054 0x0B -> True -- VT, \v
1055 0xA0 -> True -- spotted by QC..
1057 {-# INLINE isSpaceWord8 #-}
1059 -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
1060 -- slightly faster for one-shot cases.
1061 map' :: (Char -> Char) -> ByteString -> ByteString
1062 map' f = B.map' (c2w . f . w2c)
1064 -- | /O(n)/ 'filter\'' is a non-fuseable version of filter, that may be
1065 -- around 2x faster for some one-shot applications.
1066 filter' :: (Char -> Bool) -> ByteString -> ByteString
1067 filter' f = B.filter' (f . w2c)