1 {-# OPTIONS_GHC -cpp -fffi -fglasgow-exts #-}
3 -- Module : Data.ByteString.Char8
4 -- Copyright : (c) Don Stewart 2006
7 -- Maintainer : dons@cse.unsw.edu.au
8 -- Stability : experimental
9 -- Portability : portable (tested with GHC>=6.4.1 and Hugs 2005)
13 -- | Manipulate ByteStrings using Char operations. All Chars will be
14 -- truncated to 8 bits. It can be expected that these functions will run
15 -- at identical speeds to their Word8 equivalents in @Data.ByteString@.
17 -- More specifically these byte strings are taken to be in the
18 -- subset of Unicode covered by code points 0-255. This covers
19 -- Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls.
23 -- * <http://www.unicode.org/charts/>
25 -- * <http://www.unicode.org/charts/PDF/U0000.pdf>
27 -- * <http://www.unicode.org/charts/PDF/U0080.pdf>
29 -- This module is intended to be imported @qualified@, to avoid name
30 -- clashes with Prelude functions. eg.
32 -- > import qualified Data.ByteString.Char8 as B
35 module Data.ByteString.Char8 (
37 -- * The @ByteString@ type
38 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
40 -- * Introducing and eliminating 'ByteString's
41 empty, -- :: ByteString
42 packChar, -- :: Char -> ByteString
43 pack, -- :: String -> ByteString
44 unpack, -- :: ByteString -> String
47 cons, -- :: Char -> ByteString -> ByteString
48 snoc, -- :: Char -> ByteString -> ByteString
49 null, -- :: ByteString -> Bool
50 length, -- :: ByteString -> Int
51 head, -- :: ByteString -> Char
52 tail, -- :: ByteString -> ByteString
53 last, -- :: ByteString -> Char
54 init, -- :: ByteString -> ByteString
55 append, -- :: ByteString -> ByteString -> ByteString
57 -- * Special ByteStrings
58 inits, -- :: ByteString -> [ByteString]
59 tails, -- :: ByteString -> [ByteString]
60 elems, -- :: ByteString -> [ByteString]
62 -- * Transformating ByteStrings
63 map, -- :: (Char -> Char) -> ByteString -> ByteString
64 reverse, -- :: ByteString -> ByteString
65 intersperse, -- :: Char -> ByteString -> ByteString
66 transpose, -- :: [ByteString] -> [ByteString]
68 -- * Reducing 'ByteString's
69 foldl, -- :: (a -> Char -> a) -> a -> ByteString -> a
70 foldr, -- :: (Char -> a -> a) -> a -> ByteString -> a
71 foldl1, -- :: (Char -> Char -> Char) -> ByteString -> Char
72 foldr1, -- :: (Char -> Char -> Char) -> ByteString -> Char
75 concat, -- :: [ByteString] -> ByteString
76 concatMap, -- :: (Char -> ByteString) -> ByteString -> ByteString
77 any, -- :: (Char -> Bool) -> ByteString -> Bool
78 all, -- :: (Char -> Bool) -> ByteString -> Bool
79 maximum, -- :: ByteString -> Char
80 minimum, -- :: ByteString -> Char
81 mapIndexed, -- :: (Int -> Char -> Char) -> ByteString -> ByteString
83 -- * Generating and unfolding ByteStrings
84 replicate, -- :: Int -> Char -> ByteString
85 unfoldrN, -- :: (Char -> Maybe (Char, Char)) -> Char -> ByteString
89 -- ** Breaking strings
90 take, -- :: Int -> ByteString -> ByteString
91 drop, -- :: Int -> ByteString -> ByteString
92 splitAt, -- :: Int -> ByteString -> (ByteString, ByteString)
93 takeWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
94 dropWhile, -- :: (Char -> Bool) -> ByteString -> ByteString
95 break, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
96 span, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
97 spanEnd, -- :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
99 -- ** Breaking and dropping on specific Chars
100 breakChar, -- :: Char -> ByteString -> (ByteString, ByteString)
101 spanChar, -- :: Char -> ByteString -> (ByteString, ByteString)
102 breakFirst, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
103 breakLast, -- :: Char -> ByteString -> Maybe (ByteString,ByteString)
104 breakSpace, -- :: ByteString -> Maybe (ByteString,ByteString)
105 dropSpace, -- :: ByteString -> ByteString
106 dropSpaceEnd, -- :: ByteString -> ByteString
108 -- ** Breaking into many substrings
109 split, -- :: Char -> ByteString -> [ByteString]
110 splitWith, -- :: (Char -> Bool) -> ByteString -> [ByteString]
111 tokens, -- :: (Char -> Bool) -> ByteString -> [ByteString]
112 group, -- :: ByteString -> [ByteString]
113 groupBy, -- :: (Word8 -> Word8 -> Bool) -> ByteString -> [ByteString]
115 -- ** Breaking into lines and words
116 lines, -- :: ByteString -> [ByteString]
117 words, -- :: ByteString -> [ByteString]
118 unlines, -- :: [ByteString] -> ByteString
119 unwords, -- :: ByteString -> [ByteString]
121 lines', -- :: ByteString -> [ByteString]
122 unlines', -- :: [ByteString] -> ByteString
123 linesCRLF', -- :: ByteString -> [ByteString]
124 unlinesCRLF', -- :: [ByteString] -> ByteString
125 words', -- :: ByteString -> [ByteString]
126 unwords', -- :: ByteString -> [ByteString]
128 lineIndices, -- :: ByteString -> [Int]
129 betweenLines, -- :: ByteString -> ByteString -> ByteString -> Maybe (ByteString)
131 -- ** Joining strings
132 join, -- :: ByteString -> [ByteString] -> ByteString
133 joinWithChar, -- :: Char -> ByteString -> ByteString -> ByteString
135 -- * Indexing ByteStrings
136 index, -- :: ByteString -> Int -> Char
137 elemIndex, -- :: Char -> ByteString -> Maybe Int
138 elemIndexLast, -- :: Char -> ByteString -> Maybe Int
139 elemIndices, -- :: Char -> ByteString -> [Int]
140 findIndex, -- :: (Char -> Bool) -> ByteString -> Maybe Int
141 findIndices, -- :: (Char -> Bool) -> ByteString -> [Int]
142 count, -- :: Char -> ByteString -> Int
144 -- * Ordered ByteStrings
145 sort, -- :: ByteString -> ByteString
147 -- * Searching ByteStrings
149 -- ** Searching by equality
150 elem, -- :: Char -> ByteString -> Bool
151 notElem, -- :: Char -> ByteString -> Bool
152 filterChar, -- :: Char -> ByteString -> ByteString
153 filterNotChar, -- :: Char -> ByteString -> ByteString
155 -- ** Searching with a predicate
156 filter, -- :: (Char -> Bool) -> ByteString -> ByteString
157 find, -- :: (Char -> Bool) -> ByteString -> Maybe Char
159 -- ** Searching for substrings
160 isPrefixOf, -- :: ByteString -> ByteString -> Bool
161 isSuffixOf, -- :: ByteString -> ByteString -> Bool
162 isSubstringOf, -- :: ByteString -> ByteString -> Bool
163 findSubstring, -- :: ByteString -> ByteString -> Maybe Int
164 findSubstrings, -- :: ByteString -> ByteString -> [Int]
166 -- * Zipping and unzipping ByteString
167 zip, -- :: ByteString -> ByteString -> [(Char,Char)]
168 zipWith, -- :: (Char -> Char -> c) -> ByteString -> ByteString -> [c]
169 unzip, -- :: [(Char,Char)] -> (ByteString,ByteString)
171 -- * Unchecked access
172 unsafeHead, -- :: ByteString -> Char
173 unsafeTail, -- :: ByteString -> ByteString
174 unsafeIndex, -- :: ByteString -> Int -> Char
175 w2c, -- :: Word8 -> Char
176 c2w, -- :: Char -> Word8
178 -- * Reading from ByteStrings
179 readInt, -- :: ByteString -> Maybe Int
180 unsafeReadInt, -- :: ByteString -> Maybe Int
182 -- * Copying ByteStrings
183 copy, -- :: ByteString -> ByteString
185 -- * I\/O with @ByteString@s
187 -- ** Standard input and output
189 #if defined(__GLASGOW_HASKELL__)
190 getLine, -- :: IO ByteString
192 getContents, -- :: IO ByteString
193 putStr, -- :: ByteString -> IO ()
194 putStrLn, -- :: ByteString -> IO ()
197 readFile, -- :: FilePath -> IO ByteString
198 -- mmapFile, -- :: FilePath -> IO ByteString
199 writeFile, -- :: FilePath -> ByteString -> IO ()
201 -- ** I\/O with Handles
202 #if defined(__GLASGOW_HASKELL__)
203 getArgs, -- :: IO [ByteString]
204 hGetLine, -- :: Handle -> IO ByteString
205 hGetNonBlocking, -- :: Handle -> Int -> IO ByteString
207 hGetContents, -- :: Handle -> IO ByteString
208 hGet, -- :: Handle -> Int -> IO ByteString
209 hPut, -- :: Handle -> ByteString -> IO ()
211 #if defined(__GLASGOW_HASKELL__)
212 -- * Low level construction
213 -- | For constructors from foreign language types see /Data.ByteString/
214 packAddress, -- :: Addr# -> ByteString
215 unsafePackAddress, -- :: Int -> Addr# -> ByteString
218 -- * Utilities (needed for array fusion)
219 #if defined(__GLASGOW_HASKELL__)
222 noAL, NoAL, loopArr, loopAcc, loopSndAcc,
223 loopU, mapEFL, filterEFL,
228 import qualified Prelude as P
229 import Prelude hiding (reverse,head,tail,last,init,null
230 ,length,map,lines,foldl,foldr,unlines
231 ,concat,any,take,drop,splitAt,takeWhile
232 ,dropWhile,span,break,elem,filter,unwords
233 ,words,maximum,minimum,all,concatMap
234 ,foldl1,foldr1,readFile,writeFile,replicate
235 ,getContents,getLine,putStr,putStrLn
236 ,zip,zipWith,unzip,notElem)
238 import qualified Data.ByteString as B
240 -- Listy functions transparently exported
241 import Data.ByteString (ByteString(..)
242 ,empty,null,length,tail,init,append
243 ,inits,tails,elems,reverse,transpose
244 ,concat,take,drop,splitAt,join
245 ,sort,isPrefixOf,isSuffixOf,isSubstringOf,findSubstring
246 ,findSubstrings,unsafeTail,copy,group
248 ,getContents, putStr, putStrLn
249 ,readFile, {-mmapFile,-} writeFile
250 ,hGetContents, hGet, hPut
251 #if defined(__GLASGOW_HASKELL__)
252 ,getLine, getArgs, hGetLine, hGetNonBlocking
253 ,packAddress, unsafePackAddress
256 ,noAL, NoAL, loopArr, loopAcc, loopSndAcc
257 ,loopU, mapEFL, filterEFL
258 ,useAsCString, unsafeUseAsCString
263 import qualified Data.List as List (intersperse)
266 import Foreign.C.Types (CLong)
267 import Foreign.Marshal.Utils (with)
269 #if defined(__GLASGOW_HASKELL__)
270 import GHC.Base (Char(..),unsafeChr,unpackCString#,unsafeCoerce#)
271 import GHC.IOBase (IO(..),stToIO)
272 import GHC.Prim (Addr#,writeWord8OffAddr#,realWorld#,plusAddr#)
273 import GHC.Ptr (Ptr(..))
274 import GHC.ST (ST(..))
277 #define STRICT1(f) f a | a `seq` False = undefined
278 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
279 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
281 ------------------------------------------------------------------------
283 -- | /O(1)/ Convert a 'Char' into a 'ByteString'
284 packChar :: Char -> ByteString
285 packChar = B.packByte . c2w
286 {-# INLINE packChar #-}
288 -- | /O(n)/ Convert a 'String' into a 'ByteString'
290 -- For applications with large numbers of string literals, pack can be a
291 -- bottleneck. In such cases, consider using packAddress (GHC only).
292 pack :: String -> ByteString
293 #if !defined(__GLASGOW_HASKELL__)
295 pack str = B.create (P.length str) $ \p -> go p str
296 where go _ [] = return ()
297 go p (x:xs) = poke p (c2w x) >> go (p `plusPtr` 1) xs
299 #else /* hack away */
301 pack str = B.create (P.length str) $ \(Ptr p) -> stToIO (go p str)
303 go :: Addr# -> [Char] -> ST a ()
305 go p (C# c:cs) = writeByte p (unsafeCoerce# c) >> go (p `plusAddr#` 1#) cs
307 writeByte p c = ST $ \s# ->
308 case writeWord8OffAddr# p 0# c s# of s2# -> (# s2#, () #)
309 {-# INLINE writeByte #-}
312 "pack/packAddress" forall s# .
313 pack (unpackCString# s#) = B.packAddress s#
320 -- | /O(n)/ Converts a 'ByteString' to a 'String'.
321 unpack :: ByteString -> [Char]
322 unpack = B.unpackWith w2c
323 {-# INLINE unpack #-}
325 -- | /O(n)/ 'cons' is analogous to (:) for lists, but of different
326 -- complexity, as it requires a memcpy.
327 cons :: Char -> ByteString -> ByteString
331 -- | /O(n)/ Append a Char to the end of a 'ByteString'. Similar to
332 -- 'cons', this function performs a memcpy.
333 snoc :: ByteString -> Char -> ByteString
334 snoc p = B.snoc p . c2w
337 -- | /O(1)/ Extract the first element of a ByteString, which must be non-empty.
338 head :: ByteString -> Char
342 -- | /O(1)/ Extract the last element of a packed string, which must be non-empty.
343 last :: ByteString -> Char
347 -- | /O(n)/ 'map' @f xs@ is the ByteString obtained by applying @f@ to each element of @xs@
348 map :: (Char -> Char) -> ByteString -> ByteString
349 map f = B.map (c2w . f . w2c)
352 -- | /O(n)/ The 'intersperse' function takes a Char and a 'ByteString'
353 -- and \`intersperses\' that Char between the elements of the
354 -- 'ByteString'. It is analogous to the intersperse function on Lists.
355 intersperse :: Char -> ByteString -> ByteString
356 intersperse = B.intersperse . c2w
357 {-# INLINE intersperse #-}
359 -- | 'foldl', applied to a binary operator, a starting value (typically
360 -- the left-identity of the operator), and a ByteString, reduces the
361 -- ByteString using the binary operator, from left to right.
362 foldl :: (a -> Char -> a) -> a -> ByteString -> a
363 foldl f = B.foldl (\a c -> f a (w2c c))
366 -- | 'foldr', applied to a binary operator, a starting value
367 -- (typically the right-identity of the operator), and a packed string,
368 -- reduces the packed string using the binary operator, from right to left.
369 foldr :: (Char -> a -> a) -> a -> ByteString -> a
370 foldr f = B.foldr (\c a -> f (w2c c) a)
373 -- | 'foldl1' is a variant of 'foldl' that has no starting value
374 -- argument, and thus must be applied to non-empty 'ByteStrings'.
375 foldl1 :: (Char -> Char -> Char) -> ByteString -> Char
376 foldl1 f ps = w2c (B.foldl1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
377 {-# INLINE foldl1 #-}
379 -- | 'foldr1' is a variant of 'foldr' that has no starting value argument,
380 -- and thus must be applied to non-empty 'ByteString's
381 foldr1 :: (Char -> Char -> Char) -> ByteString -> Char
382 foldr1 f ps = w2c (B.foldr1 (\x y -> c2w (f (w2c x) (w2c y))) ps)
383 {-# INLINE foldr1 #-}
385 -- | Map a function over a 'ByteString' and concatenate the results
386 concatMap :: (Char -> ByteString) -> ByteString -> ByteString
387 concatMap f = B.concatMap (f . w2c)
388 {-# INLINE concatMap #-}
390 -- | Applied to a predicate and a ByteString, 'any' determines if
391 -- any element of the 'ByteString' satisfies the predicate.
392 any :: (Char -> Bool) -> ByteString -> Bool
393 any f = B.any (f . w2c)
396 -- | Applied to a predicate and a 'ByteString', 'all' determines if
397 -- all elements of the 'ByteString' satisfy the predicate.
398 all :: (Char -> Bool) -> ByteString -> Bool
399 all f = B.all (f . w2c)
402 -- | 'maximum' returns the maximum value from a 'ByteString'
403 maximum :: ByteString -> Char
404 maximum = w2c . B.maximum
405 {-# INLINE maximum #-}
407 -- | 'minimum' returns the minimum value from a 'ByteString'
408 minimum :: ByteString -> Char
409 minimum = w2c . B.minimum
410 {-# INLINE minimum #-}
412 -- | /O(n)/ map Char functions, provided with the index at each position
413 mapIndexed :: (Int -> Char -> Char) -> ByteString -> ByteString
414 mapIndexed f = B.mapIndexed (\i c -> c2w (f i (w2c c)))
415 {-# INLINE mapIndexed #-}
417 -- | /O(n)/ 'replicate' @n x@ is a ByteString of length @n@ with @x@
418 -- the value of every element. The following holds:
420 -- > replicate w c = unfoldr w (\u -> Just (u,u)) c
422 -- This implemenation uses @memset(3)@
423 replicate :: Int -> Char -> ByteString
424 replicate w = B.replicate w . c2w
425 {-# INLINE replicate #-}
427 -- | /O(n)/ The 'unfoldrN' function is analogous to the List \'unfoldr\'.
428 -- 'unfoldrN' builds a ByteString from a seed value. The function takes
429 -- the element and returns 'Nothing' if it is done producing the
430 -- ByteString or returns 'Just' @(a,b)@, in which case, @a@ is a
431 -- prepending to the ByteString and @b@ is used as the next element in a
434 -- To preven unfoldrN having /O(n^2)/ complexity (as prepending a
435 -- character to a ByteString is /O(n)/, this unfoldr requires a maximum
436 -- final size of the ByteString as an argument. 'cons' can then be
437 -- implemented in /O(1)/ (i.e. a 'poke'), and the unfoldr itself has
438 -- linear complexity. The depth of the recursion is limited to this
439 -- size, but may be less. For lazy, infinite unfoldr, use
440 -- 'Data.List.unfoldr' (from 'Data.List').
444 -- > unfoldrN 10 (\x -> Just (x, chr (ord x + 1))) '0' == "0123456789"
446 -- The following equation connects the depth-limited unfoldr to the List unfoldr:
448 -- > unfoldrN n == take n $ List.unfoldr
450 unfoldrN :: Int -> (Char -> Maybe (Char, Char)) -> Char -> ByteString
451 unfoldrN n f w = B.unfoldrN n ((k `fmap`) . f . w2c) (c2w w)
452 where k (i,j) = (c2w i, c2w j) -- (c2w *** c2w)
453 {-# INLINE unfoldrN #-}
455 -- | 'takeWhile', applied to a predicate @p@ and a ByteString @xs@,
456 -- returns the longest prefix (possibly empty) of @xs@ of elements that
458 takeWhile :: (Char -> Bool) -> ByteString -> ByteString
459 takeWhile f = B.takeWhile (f . w2c)
460 {-# INLINE takeWhile #-}
462 -- | 'dropWhile' @p xs@ returns the suffix remaining after 'takeWhile' @p xs@.
463 dropWhile :: (Char -> Bool) -> ByteString -> ByteString
464 dropWhile f = B.dropWhile (f . w2c)
465 {-# INLINE dropWhile #-}
467 -- | 'break' @p@ is equivalent to @'span' ('not' . p)@.
468 break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
469 break f = B.break (f . w2c)
472 -- | 'span' @p xs@ breaks the ByteString into two segments. It is
473 -- equivalent to @('takeWhile' p xs, 'dropWhile' p xs)@
474 span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
475 span f = B.span (f . w2c)
478 -- | 'spanEnd' behaves like 'span' but from the end of the 'ByteString'.
481 -- > spanEnd (not.isSpace) "x y z" == ("x y ","z")
485 -- > spanEnd (not . isSpace) ps
487 -- > let (x,y) = span (not.isSpace) (reverse ps) in (reverse y, reverse x)
489 spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString)
490 spanEnd f = B.spanEnd (f . w2c)
491 {-# INLINE spanEnd #-}
493 -- | 'breakChar' breaks its ByteString argument at the first occurence
494 -- of the specified Char. It is more efficient than 'break' as it is
495 -- implemented with @memchr(3)@. I.e.
497 -- > break (=='c') "abcd" == breakChar 'c' "abcd"
499 breakChar :: Char -> ByteString -> (ByteString, ByteString)
500 breakChar = B.breakByte . c2w
501 {-# INLINE breakChar #-}
503 -- | 'spanChar' breaks its ByteString argument at the first
504 -- occurence of a Char other than its argument. It is more efficient
507 -- > span (=='c') "abcd" == spanByte 'c' "abcd"
509 spanChar :: Char -> ByteString -> (ByteString, ByteString)
510 spanChar = B.spanByte . c2w
511 {-# INLINE spanChar #-}
513 -- | /O(n)/ 'breakFirst' breaks the given ByteString on the first
514 -- occurence of @w@. It behaves like 'break', except the delimiter is
515 -- not returned, and @Nothing@ is returned if the delimiter is not in
516 -- the ByteString. I.e.
518 -- > breakFirst 'b' "aabbcc" == Just ("aa","bcc")
520 -- > breakFirst c xs ==
521 -- > let (x,y) = break (== c) xs
522 -- > in if null y then Nothing else Just (x, drop 1 y))
524 breakFirst :: Char -> ByteString -> Maybe (ByteString,ByteString)
525 breakFirst = B.breakFirst . c2w
526 {-# INLINE breakFirst #-}
528 -- | /O(n)/ 'breakLast' behaves like breakFirst, but from the end of the
531 -- > breakLast ('b') (pack "aabbcc") == Just ("aab","cc")
533 -- and the following are equivalent:
535 -- > breakLast 'c' "abcdef"
536 -- > let (x,y) = break (=='c') (reverse "abcdef")
537 -- > in if null x then Nothing else Just (reverse (drop 1 y), reverse x)
539 breakLast :: Char -> ByteString -> Maybe (ByteString,ByteString)
540 breakLast = B.breakLast . c2w
541 {-# INLINE breakLast #-}
543 -- | /O(n)/ Break a 'ByteString' into pieces separated by the byte
544 -- argument, consuming the delimiter. I.e.
546 -- > split '\n' "a\nb\nd\ne" == ["a","b","d","e"]
547 -- > split 'a' "aXaXaXa" == ["","X","X","X"]
548 -- > split 'x' "x" == ["",""]
552 -- > join [c] . split c == id
553 -- > split == splitWith . (==)
555 -- As for all splitting functions in this library, this function does
556 -- not copy the substrings, it just constructs new 'ByteStrings' that
557 -- are slices of the original.
559 split :: Char -> ByteString -> [ByteString]
560 split = B.split . c2w
563 -- | /O(n)/ Splits a 'ByteString' into components delimited by
564 -- separators, where the predicate returns True for a separator element.
565 -- The resulting components do not contain the separators. Two adjacent
566 -- separators result in an empty component in the output. eg.
568 -- > splitWith (=='a') "aabbaca" == ["","","bb","c",""]
570 splitWith :: (Char -> Bool) -> ByteString -> [ByteString]
571 splitWith f = B.splitWith (f . w2c)
572 {-# INLINE splitWith #-}
573 -- the inline makes a big difference here.
575 -- | Like 'splitWith', except that sequences of adjacent separators are
576 -- treated as a single separator. eg.
578 -- > tokens (=='a') "aabbaca" == ["bb","c"]
580 tokens :: (Char -> Bool) -> ByteString -> [ByteString]
581 tokens f = B.tokens (f . w2c)
582 {-# INLINE tokens #-}
584 -- | The 'groupBy' function is the non-overloaded version of 'group'.
585 groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString]
586 groupBy k = B.groupBy (\a b -> k (w2c a) (w2c b))
588 -- | /O(n)/ joinWithChar. An efficient way to join to two ByteStrings with a
589 -- char. Around 4 times faster than the generalised join.
591 joinWithChar :: Char -> ByteString -> ByteString -> ByteString
592 joinWithChar = B.joinWithByte . c2w
593 {-# INLINE joinWithChar #-}
595 -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0.
596 index :: ByteString -> Int -> Char
597 index = (w2c .) . B.index
600 -- | /O(n)/ The 'elemIndex' function returns the index of the first
601 -- element in the given 'ByteString' which is equal (by memchr) to the
602 -- query element, or 'Nothing' if there is no such element.
603 elemIndex :: Char -> ByteString -> Maybe Int
604 elemIndex = B.elemIndex . c2w
605 {-# INLINE elemIndex #-}
607 -- | /O(n)/ The 'elemIndexLast' function returns the last index of the
608 -- element in the given 'ByteString' which is equal to the query
609 -- element, or 'Nothing' if there is no such element. The following
612 -- > elemIndexLast c xs ==
613 -- > (-) (length xs - 1) `fmap` elemIndex c (reverse xs)
615 elemIndexLast :: Char -> ByteString -> Maybe Int
616 elemIndexLast = B.elemIndexLast . c2w
617 {-# INLINE elemIndexLast #-}
619 -- | /O(n)/ The 'elemIndices' function extends 'elemIndex', by returning
620 -- the indices of all elements equal to the query element, in ascending order.
621 elemIndices :: Char -> ByteString -> [Int]
622 elemIndices = B.elemIndices . c2w
623 {-# INLINE elemIndices #-}
625 -- | The 'findIndex' function takes a predicate and a 'ByteString' and
626 -- returns the index of the first element in the ByteString satisfying the predicate.
627 findIndex :: (Char -> Bool) -> ByteString -> Maybe Int
628 findIndex f = B.findIndex (f . w2c)
629 {-# INLINE findIndex #-}
631 -- | The 'findIndices' function extends 'findIndex', by returning the
632 -- indices of all elements satisfying the predicate, in ascending order.
633 findIndices :: (Char -> Bool) -> ByteString -> [Int]
634 findIndices f = B.findIndices (f . w2c)
636 -- | count returns the number of times its argument appears in the ByteString
638 -- > count = length . elemIndices
642 -- > count '\n' == length . lines
644 -- But more efficiently than using length on the intermediate list.
645 count :: Char -> ByteString -> Int
646 count c = B.count (c2w c)
648 -- | /O(n)/ 'elem' is the 'ByteString' membership predicate. This
649 -- implementation uses @memchr(3)@.
650 elem :: Char -> ByteString -> Bool
651 elem c = B.elem (c2w c)
654 -- | /O(n)/ 'notElem' is the inverse of 'elem'
655 notElem :: Char -> ByteString -> Bool
656 notElem c = B.notElem (c2w c)
657 {-# INLINE notElem #-}
659 -- | /O(n)/ 'filter', applied to a predicate and a ByteString,
660 -- returns a ByteString containing those characters that satisfy the
662 filter :: (Char -> Bool) -> ByteString -> ByteString
663 filter f = B.filter (f . w2c)
664 {-# INLINE filter #-}
666 -- | /O(n)/ The 'find' function takes a predicate and a ByteString,
667 -- and returns the first element in matching the predicate, or 'Nothing'
668 -- if there is no such element.
669 find :: (Char -> Bool) -> ByteString -> Maybe Char
670 find f ps = w2c `fmap` B.find (f . w2c) ps
673 -- | /O(n)/ A first order equivalent of /filter . (==)/, for the common
674 -- case of filtering a single Char. It is more efficient to use
675 -- filterChar in this case.
677 -- > filterChar == filter . (==)
679 -- filterChar is around 10x faster, and uses much less space, than its
682 filterChar :: Char -> ByteString -> ByteString
683 filterChar c = B.filterByte (c2w c)
684 {-# INLINE filterChar #-}
686 -- | /O(n)/ A first order equivalent of /filter . (\/=)/, for the common
687 -- case of filtering a single Char out of a list. It is more efficient
688 -- to use /filterNotChar/ in this case.
690 -- > filterNotChar == filter . (/=)
692 -- filterNotChar is around 3x faster, and uses much less space, than its
695 filterNotChar :: Char -> ByteString -> ByteString
696 filterNotChar c = B.filterNotByte (c2w c)
697 {-# INLINE filterNotChar #-}
699 -- | /O(n)/ 'zip' takes two ByteStrings and returns a list of
700 -- corresponding pairs of Chars. If one input ByteString is short,
701 -- excess elements of the longer ByteString are discarded. This is
702 -- equivalent to a pair of 'unpack' operations, and so space
703 -- usage may be large for multi-megabyte ByteStrings
704 zip :: ByteString -> ByteString -> [(Char,Char)]
706 | B.null ps || B.null qs = []
707 | otherwise = (unsafeHead ps, unsafeHead qs) : zip (B.unsafeTail ps) (B.unsafeTail qs)
709 -- | 'zipWith' generalises 'zip' by zipping with the function given as
710 -- the first argument, instead of a tupling function. For example,
711 -- @'zipWith' (+)@ is applied to two ByteStrings to produce the list
712 -- of corresponding sums.
713 zipWith :: (Char -> Char -> a) -> ByteString -> ByteString -> [a]
714 zipWith f = B.zipWith ((. w2c) . f . w2c)
716 -- | 'unzip' transforms a list of pairs of Chars into a pair of
717 -- ByteStrings. Note that this performs two 'pack' operations.
718 unzip :: [(Char,Char)] -> (ByteString,ByteString)
719 unzip ls = (pack (P.map fst ls), pack (P.map snd ls))
722 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits
723 -- the check for the empty case, which is good for performance, but
724 -- there is an obligation on the programmer to provide a proof that the
725 -- ByteString is non-empty.
726 unsafeHead :: ByteString -> Char
727 unsafeHead = w2c . B.unsafeHead
728 {-# INLINE unsafeHead #-}
730 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a Char.
731 -- This omits the bounds check, which means there is an accompanying
732 -- obligation on the programmer to ensure the bounds are checked in some
734 unsafeIndex :: ByteString -> Int -> Char
735 unsafeIndex = (w2c .) . B.unsafeIndex
736 {-# INLINE unsafeIndex #-}
738 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
740 #if !defined(__GLASGOW_HASKELL__)
741 w2c = chr . fromIntegral
743 w2c = unsafeChr . fromIntegral
747 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
748 -- silently truncates to 8 bits Chars > '\255'. It is provided as
749 -- convenience for ByteString construction.
751 c2w = fromIntegral . ord
754 -- ---------------------------------------------------------------------
755 -- Things that depend on the encoding
757 -- | 'breakSpace' returns the pair of ByteStrings when the argument is
758 -- broken at the first whitespace byte. I.e.
760 -- > break isSpace == breakSpace
762 breakSpace :: ByteString -> (ByteString,ByteString)
763 breakSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
764 i <- firstspace (p `plusPtr` s) 0 l
765 return $ case () of {_
766 | i == 0 -> (empty, PS x s l)
767 | i == l -> (PS x s l, empty)
768 | otherwise -> (PS x s i, PS x (s+i) (l-i))
770 {-# INLINE breakSpace #-}
772 firstspace :: Ptr Word8 -> Int -> Int -> IO Int
776 | otherwise = do w <- peekByteOff ptr n
777 if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n
779 -- | 'dropSpace' efficiently returns the 'ByteString' argument with
780 -- white space Chars removed from the front. It is more efficient than
781 -- calling dropWhile for removing whitespace. I.e.
783 -- > dropWhile isSpace == dropSpace
785 dropSpace :: ByteString -> ByteString
786 dropSpace (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
787 i <- firstnonspace (p `plusPtr` s) 0 l
788 return $ if i == l then empty else PS x (s+i) (l-i)
789 {-# INLINE dropSpace #-}
791 firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
792 STRICT3(firstnonspace)
793 firstnonspace ptr n m
795 | otherwise = do w <- peekElemOff ptr n
796 if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n
798 -- | 'dropSpaceEnd' efficiently returns the 'ByteString' argument with
799 -- white space removed from the end. I.e.
801 -- > reverse . (dropWhile isSpace) . reverse == dropSpaceEnd
803 -- but it is more efficient than using multiple reverses.
805 dropSpaceEnd :: ByteString -> ByteString
806 dropSpaceEnd (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
807 i <- lastnonspace (p `plusPtr` s) (l-1)
808 return $ if i == (-1) then empty else PS x s (i+1)
809 {-# INLINE dropSpaceEnd #-}
811 lastnonspace :: Ptr Word8 -> Int -> IO Int
812 STRICT2(lastnonspace)
815 | otherwise = do w <- peekElemOff ptr n
816 if isSpaceWord8 w then lastnonspace ptr (n-1) else return n
818 -- | 'lines' breaks a ByteString up into a list of ByteStrings at
819 -- newline Chars. The resulting strings do not contain newlines.
821 lines :: ByteString -> [ByteString]
824 | otherwise = case search ps of
826 Just n -> take n ps : lines (drop (n+1) ps)
827 where search = elemIndex '\n'
833 P.length . lines = count '\n'
838 -- Just as fast, but more complex. Should be much faster, I thought.
839 lines :: ByteString -> [ByteString]
840 lines (PS _ _ 0) = []
841 lines (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> do
842 let ptr = p `plusPtr` s
846 let q = memchr (ptr `plusPtr` n) 0x0a (fromIntegral (l-n))
848 then return [PS x (s+n) (l-n)]
849 else do let i = q `minusPtr` ptr
851 return $! PS x (s+n) (i-n) : ls
855 -- | 'unlines' is an inverse operation to 'lines'. It joins lines,
856 -- after appending a terminating newline to each.
857 unlines :: [ByteString] -> ByteString
859 unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space
860 where nl = packChar '\n'
862 -- | 'words' breaks a ByteString up into a list of words, which
863 -- were delimited by Chars representing white space. And
865 -- > tokens isSpace = words
867 words :: ByteString -> [ByteString]
868 words = B.tokens isSpaceWord8
870 -- | The 'unwords' function is analogous to the 'unlines' function, on words.
871 unwords :: [ByteString] -> ByteString
872 unwords = join (packChar ' ')
874 -- | /O(n)/ Indicies of newlines. Shorthand for
876 -- > elemIndices '\n'
878 lineIndices :: ByteString -> [Int]
879 lineIndices = elemIndices '\n'
881 -- | 'lines\'' behaves like 'lines', in that it breaks a ByteString on
882 -- newline Chars. However, unlike the Prelude functions, 'lines\'' and
883 -- 'unlines\'' correctly reconstruct lines that are missing terminating
884 -- newlines characters. I.e.
886 -- > unlines (lines "a\nb\nc") == "a\nb\nc\n"
887 -- > unlines' (lines' "a\nb\nc") == "a\nb\nc"
889 -- Note that this means:
891 -- > lines "a\nb\nc\n" == ["a","b","c"]
892 -- > lines' "a\nb\nc\n" == ["a","b","c",""]
894 lines' :: ByteString -> [ByteString]
895 lines' ps = ps `seq` case elemIndex '\n' ps of
897 Just n -> take n ps : lines' (drop (n+1) ps)
899 -- | 'linesCRLF\'' behaves like 'lines\'', but breaks on (\\cr?\\lf)
900 linesCRLF' :: ByteString -> [ByteString]
901 linesCRLF' ps = ps `seq` case elemIndex '\n' ps of
903 Just 0 -> empty : linesCRLF' (drop 1 ps)
904 Just n -> let k = if ps `unsafeIndex` (n-1) == '\r' then n-1 else n
905 in take k ps : linesCRLF' (drop (n+1) ps)
907 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
908 -- retores lines that do not have terminating newlines (see the
909 -- description for 'lines\'').
911 unlines' :: [ByteString] -> ByteString
912 unlines' ss = concat $ intersperse_newlines ss
913 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
914 intersperse_newlines s = s
915 newline = packChar '\n'
917 -- | 'unlines\'' behaves like 'unlines', except that it also correctly
918 -- retores lines that do not have terminating newlines (see the
919 -- description for 'lines\''). Uses CRLF instead of LF.
921 unlinesCRLF' :: [ByteString] -> ByteString
922 unlinesCRLF' ss = concat $ intersperse_newlines ss
923 where intersperse_newlines (a:b:s) = a:newline: intersperse_newlines (b:s)
924 intersperse_newlines s = s
925 newline = pack "\r\n"
927 -- | 'words\'' behaves like 'words', with the exception that it produces
928 -- output on ByteStrings with trailing whitespace that can be
929 -- correctly inverted by 'unwords'. I.e.
931 -- > words "a b c " == ["a","b","c"]
932 -- > words' "a b c " == ["a","b","c",""]
934 -- > unwords $ words "a b c " == "a b c"
935 -- > unwords $ words' "a b c " == "a b c "
937 words' :: ByteString -> [ByteString]
938 words' = B.splitWith isSpaceWord8
940 -- | 'unwords\'' behaves like 'unwords'. It is provided for consistency
941 -- with the other invertable words and lines functions.
942 unwords' :: [ByteString] -> ByteString
945 -- | 'betweenLines' returns the ByteString between the two lines given,
946 -- or Nothing if they do not appear. The returned string is the first
947 -- and shortest string such that the line before it is the given first
948 -- line, and the line after it is the given second line.
949 betweenLines :: ByteString -- ^ First line to look for
950 -> ByteString -- ^ Second line to look for
951 -> ByteString -- ^ 'ByteString' to look in
952 -> Maybe (ByteString)
954 betweenLines start end ps =
955 case P.break (start ==) (lines ps) of
956 (_, _:rest@(PS ps1 s1 _:_)) ->
957 case P.break (end ==) rest of
958 (_, PS _ s2 _:_) -> Just $ PS ps1 s1 (s2 - s1)
962 -- ---------------------------------------------------------------------
963 -- Reading from ByteStrings
965 -- | readInt skips any whitespace at the beginning of its argument, and
966 -- reads an Int from the beginning of the ByteString. If there is no
967 -- integer at the beginning of the string, it returns Nothing, otherwise
968 -- it just returns the int read, and the rest of the string.
969 readInt :: ByteString -> Maybe (Int, ByteString)
970 readInt p@(PS x s l) = inlinePerformIO $ useAsCString p $ \cstr ->
971 with (castPtr cstr) $ \endpp -> do
972 val <- c_strtol (castPtr cstr) endpp 0
973 skipped <- (`minusPtr` cstr) `fmap` peek endpp
974 return $ if skipped == 0
976 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
978 -- | unsafeReadInt is like readInt, but requires a null terminated
979 -- ByteString. It avoids a copy if this is the case. It returns the Int
980 -- read, if any, and the rest of the string.
981 unsafeReadInt :: ByteString -> Maybe (Int, ByteString)
982 unsafeReadInt p@(PS x s l) = inlinePerformIO $ unsafeUseAsCString p $ \cstr ->
983 with (castPtr cstr) $ \endpp -> do
984 val <- c_strtol (castPtr cstr) endpp 0
985 skipped <- (`minusPtr` cstr) `fmap` peek endpp
986 return $ if skipped == 0
988 else Just (fromIntegral val, PS x (s+skipped) (l-skipped))
990 foreign import ccall unsafe "stdlib.h strtol" c_strtol
991 :: Ptr Word8 -> Ptr (Ptr Word8) -> Int -> IO CLong
995 -- not quite there yet
997 readInt :: ByteString -> Maybe (Int, ByteString)
1002 | B.null ps = Nothing
1003 | x == '-' = neg 0 xs
1004 | otherwise = pos (parse x) xs
1005 where (x, xs) = (ps `unsafeIndex` 0, unsafeTail ps)
1008 neg n qs | isSpace x = return $ Just ((i-n),xs)
1009 | otherwise = neg (parse x + (10 * n)) xs
1010 where (x, xs) = (qs `unsafeIndex` 0, unsafeTail qs)
1013 pos n qs | isSpace x = go (i+n) xs
1014 | otherwise = pos (parse x + (10 * n)) xs
1015 where (x, xs) = (qs `unsafeIndexWord8` 0, unsafeTail qs)
1017 parse w = fromIntegral (w - 48) :: Int
1018 {-# INLINE parse #-}
1021 -- ---------------------------------------------------------------------
1024 -- Just like inlinePerformIO, but we inline it. Big performance gains as
1025 -- it exposes lots of things to further inlining
1027 {-# INLINE inlinePerformIO #-}
1028 inlinePerformIO :: IO a -> a
1029 #if defined(__GLASGOW_HASKELL__)
1030 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
1032 inlinePerformIO = unsafePerformIO
1035 -- Selects white-space characters in the Latin-1 range
1036 -- ordered by frequency
1038 isSpaceWord8 :: Word8 -> Bool
1039 isSpaceWord8 w = case w of
1040 0x20 -> True -- SPACE
1041 0x0A -> True -- LF, \n
1042 0x09 -> True -- HT, \t
1043 0x0C -> True -- FF, \f
1044 0x0D -> True -- CR, \r
1045 0x0B -> True -- VT, \v
1046 0xA0 -> True -- spotted by QC..
1048 {-# INLINE isSpaceWord8 #-}
1050 -- | /O(n)/ Like 'map', but not fuseable. The benefit is that it is
1051 -- slightly faster for one-shot cases.
1052 mapF :: (Char -> Char) -> ByteString -> ByteString
1053 mapF f = B.mapF (c2w . f . w2c)
1055 -- | /O(n)/ 'filterF' is a non-fuseable version of filter, that may be
1056 -- around 2x faster for some one-shot applications.
1057 filterF :: (Char -> Bool) -> ByteString -> ByteString
1058 filterF f = B.filterF (f . w2c)