1 {-# OPTIONS_GHC -cpp -fglasgow-exts #-}
3 -- Module : Data.ByteString.Base
5 -- Maintainer : dons@cse.unsw.edu.au
6 -- Stability : experimental
7 -- Portability : portable
9 -- A module containing semi-public 'ByteString' internals. This exposes
10 -- the 'ByteString' representation and low level construction functions.
11 -- Modules which extend the 'ByteString' system will need to use this module
12 -- while ideally most users will be able to make do with the public interface
15 module Data.ByteString.Base (
17 -- * The @ByteString@ type and representation
18 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
19 LazyByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
22 unsafeHead, -- :: ByteString -> Word8
23 unsafeTail, -- :: ByteString -> ByteString
24 unsafeIndex, -- :: ByteString -> Int -> Word8
25 unsafeTake, -- :: Int -> ByteString -> ByteString
26 unsafeDrop, -- :: Int -> ByteString -> ByteString
28 -- * Low level introduction and elimination
29 empty, -- :: ByteString
30 create, -- :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
31 createAndTrim, -- :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
32 createAndTrim', -- :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
33 mallocByteString, -- :: Int -> IO (ForeignPtr a)
35 unsafeCreate, -- :: Int -> (Ptr Word8 -> IO ()) -> ByteString
36 unsafeUseAsCString, -- :: ByteString -> (CString -> IO a) -> IO a
37 unsafeUseAsCStringLen, -- :: ByteString -> (CStringLen -> IO a) -> IO a
39 fromForeignPtr, -- :: ForeignPtr Word8 -> Int -> ByteString
40 toForeignPtr, -- :: ByteString -> (ForeignPtr Word8, Int, Int)
42 #if defined(__GLASGOW_HASKELL__)
43 packCStringFinalizer, -- :: Ptr Word8 -> Int -> IO () -> IO ByteString
44 packAddress, -- :: Addr# -> ByteString
45 unsafePackAddress, -- :: Int -> Addr# -> ByteString
46 unsafeFinalize, -- :: ByteString -> IO ()
50 inlinePerformIO, -- :: IO a -> a
51 nullForeignPtr, -- :: ForeignPtr Word8
53 countOccurrences, -- :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
55 -- * Standard C Functions
56 c_strlen, -- :: CString -> IO CInt
57 c_malloc, -- :: CInt -> IO (Ptr Word8)
58 c_free, -- :: Ptr Word8 -> IO ()
59 c_free_finalizer, -- :: FunPtr (Ptr Word8 -> IO ())
61 memchr, -- :: Ptr Word8 -> Word8 -> CSize -> IO Ptr Word8
62 memcmp, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
63 memcpy, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
64 memmove, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
65 memset, -- :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
68 c_reverse, -- :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
69 c_intersperse, -- :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
70 c_maximum, -- :: Ptr Word8 -> CInt -> IO Word8
71 c_minimum, -- :: Ptr Word8 -> CInt -> IO Word8
72 c_count, -- :: Ptr Word8 -> CInt -> Word8 -> IO CInt
74 -- * Internal GHC magic
75 #if defined(__GLASGOW_HASKELL__)
76 memcpy_ptr_baoff, -- :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())
80 w2c, c2w, isSpaceWord8
84 import Foreign.ForeignPtr (ForeignPtr, newForeignPtr_, withForeignPtr)
85 import Foreign.Ptr (Ptr, FunPtr, plusPtr, castPtr)
86 import Foreign.Storable (Storable(..))
87 import Foreign.C.Types (CInt, CSize, CULong)
88 import Foreign.C.String (CString, CStringLen)
90 import Control.Exception (assert)
92 import Data.Char (ord)
93 import Data.Word (Word8)
95 #if defined(__GLASGOW_HASKELL__)
96 import qualified Foreign.ForeignPtr as FC (finalizeForeignPtr)
97 import qualified Foreign.Concurrent as FC (newForeignPtr)
99 import Data.Generics (Data(..), Typeable(..))
100 import GHC.Prim (Addr#)
101 import GHC.Ptr (Ptr(..))
102 import GHC.Base (realWorld#,unsafeChr)
103 import GHC.IOBase (IO(IO), unsafePerformIO, RawBuffer)
105 import Data.Char (chr)
106 import System.IO.Unsafe (unsafePerformIO)
109 #if __GLASGOW_HASKELL__ >= 605 && !defined(SLOW_FOREIGN_PTR)
110 import GHC.ForeignPtr (mallocPlainForeignPtrBytes)
112 import Foreign.ForeignPtr (mallocForeignPtrBytes)
115 #if __GLASGOW_HASKELL__>=605
116 import GHC.ForeignPtr (ForeignPtr(ForeignPtr))
117 import GHC.Base (nullAddr#)
119 import Foreign.Ptr (nullPtr)
122 -- CFILES stuff is Hugs only
123 {-# CFILES cbits/fpstring.c #-}
125 -- -----------------------------------------------------------------------------
127 -- Useful macros, until we have bang patterns
130 #define STRICT1(f) f a | a `seq` False = undefined
131 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
132 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
133 #define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
134 #define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
136 -- -----------------------------------------------------------------------------
138 -- | A space-efficient representation of a Word8 vector, supporting many
139 -- efficient operations. A 'ByteString' contains 8-bit characters only.
141 -- Instances of Eq, Ord, Read, Show, Data, Typeable
143 data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8)
144 {-# UNPACK #-} !Int -- offset
145 {-# UNPACK #-} !Int -- length
147 #if defined(__GLASGOW_HASKELL__)
148 deriving (Data, Typeable)
151 instance Show ByteString where
152 showsPrec p ps r = showsPrec p (unpackWith w2c ps) r
154 instance Read ByteString where
155 readsPrec p str = [ (packWith c2w x, y) | (x, y) <- readsPrec p str ]
157 -- | /O(n)/ Converts a 'ByteString' to a '[a]', using a conversion function.
158 unpackWith :: (Word8 -> a) -> ByteString -> [a]
159 unpackWith _ (PS _ _ 0) = []
160 unpackWith k (PS ps s l) = inlinePerformIO $ withForeignPtr ps $ \p ->
161 go (p `plusPtr` s) (l - 1) []
164 go p 0 acc = peek p >>= \e -> return (k e : acc)
165 go p n acc = peekByteOff p n >>= \e -> go p (n-1) (k e : acc)
166 {-# INLINE unpackWith #-}
167 {-# SPECIALIZE unpackWith :: (Word8 -> Char) -> ByteString -> [Char] #-}
169 -- | /O(n)/ Convert a '[a]' into a 'ByteString' using some
170 -- conversion function
171 packWith :: (a -> Word8) -> [a] -> ByteString
172 packWith k str = unsafeCreate (length str) $ \p -> go p str
176 go p (x:xs) = poke p (k x) >> go (p `plusPtr` 1) xs -- less space than pokeElemOff
177 {-# INLINE packWith #-}
178 {-# SPECIALIZE packWith :: (Char -> Word8) -> [Char] -> ByteString #-}
180 ------------------------------------------------------------------------
182 -- | A space-efficient representation of a Word8 vector, supporting many
183 -- efficient operations. A 'ByteString' contains 8-bit characters only.
185 -- Instances of Eq, Ord, Read, Show, Data, Typeable
187 newtype LazyByteString = LPS [ByteString] -- LPS for lazy packed string
189 #if defined(__GLASGOW_HASKELL__)
194 ------------------------------------------------------------------------
196 -- | /O(1)/ The empty 'ByteString'
198 empty = PS nullForeignPtr 0 0
200 nullForeignPtr :: ForeignPtr Word8
201 #if __GLASGOW_HASKELL__>=605
202 nullForeignPtr = ForeignPtr nullAddr# undefined --TODO: should ForeignPtrContents be strict?
204 nullForeignPtr = unsafePerformIO $ newForeignPtr_ nullPtr
205 {-# NOINLINE nullForeignPtr #-}
208 -- ---------------------------------------------------------------------
210 -- Extensions to the basic interface
213 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits the
214 -- check for the empty case, so there is an obligation on the programmer
215 -- to provide a proof that the ByteString is non-empty.
216 unsafeHead :: ByteString -> Word8
217 unsafeHead (PS x s l) = assert (l > 0) $
218 inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p s
219 {-# INLINE unsafeHead #-}
221 -- | A variety of 'tail' for non-empty ByteStrings. 'unsafeTail' omits the
222 -- check for the empty case. As with 'unsafeHead', the programmer must
223 -- provide a separate proof that the ByteString is non-empty.
224 unsafeTail :: ByteString -> ByteString
225 unsafeTail (PS ps s l) = assert (l > 0) $ PS ps (s+1) (l-1)
226 {-# INLINE unsafeTail #-}
228 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a 'Word8'
229 -- This omits the bounds check, which means there is an accompanying
230 -- obligation on the programmer to ensure the bounds are checked in some
232 unsafeIndex :: ByteString -> Int -> Word8
233 unsafeIndex (PS x s l) i = assert (i >= 0 && i < l) $
234 inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p (s+i)
235 {-# INLINE unsafeIndex #-}
237 -- | A variety of 'take' which omits the checks on @n@ so there is an
238 -- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
239 unsafeTake :: Int -> ByteString -> ByteString
240 unsafeTake n (PS x s l) = assert (0 <= n && n <= l) $ PS x s n
241 {-# INLINE unsafeTake #-}
243 -- | A variety of 'drop' which omits the checks on @n@ so there is an
244 -- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
245 unsafeDrop :: Int -> ByteString -> ByteString
246 unsafeDrop n (PS x s l) = assert (0 <= n && n <= l) $ PS x (s+n) (l-n)
247 {-# INLINE unsafeDrop #-}
249 -- ---------------------------------------------------------------------
250 -- Low level constructors
252 -- | /O(1)/ Build a ByteString from a ForeignPtr
253 fromForeignPtr :: ForeignPtr Word8 -> Int -> ByteString
254 fromForeignPtr fp l = PS fp 0 l
256 -- | /O(1)/ Deconstruct a ForeignPtr from a ByteString
257 toForeignPtr :: ByteString -> (ForeignPtr Word8, Int, Int)
258 toForeignPtr (PS ps s l) = (ps, s, l)
260 -- | A way of creating ByteStrings outside the IO monad. The @Int@
261 -- argument gives the final size of the ByteString. Unlike
262 -- 'createAndTrim' the ByteString is not reallocated if the final size
263 -- is less than the estimated size.
264 unsafeCreate :: Int -> (Ptr Word8 -> IO ()) -> ByteString
265 unsafeCreate l f = unsafePerformIO (create l f)
266 {-# INLINE unsafeCreate #-}
268 -- | Create ByteString of size @l@ and use action @f@ to fill it's contents.
269 create :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
271 fp <- mallocByteString l
272 withForeignPtr fp $ \p -> f p
275 -- | Given the maximum size needed and a function to make the contents
276 -- of a ByteString, createAndTrim makes the 'ByteString'. The generating
277 -- function is required to return the actual final size (<= the maximum
278 -- size), and the resulting byte array is realloced to this size.
280 -- createAndTrim is the main mechanism for creating custom, efficient
281 -- ByteString functions, using Haskell or C functions to fill the space.
283 createAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
284 createAndTrim l f = do
285 fp <- mallocByteString l
286 withForeignPtr fp $ \p -> do
288 if assert (l' <= l) $ l' >= l
289 then return $! PS fp 0 l
290 else create l' $ \p' -> memcpy p' p (fromIntegral l')
292 createAndTrim' :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
293 createAndTrim' l f = do
294 fp <- mallocByteString l
295 withForeignPtr fp $ \p -> do
296 (off, l', res) <- f p
297 if assert (l' <= l) $ l' >= l
298 then return $! (PS fp 0 l, res)
299 else do ps <- create l' $ \p' ->
300 memcpy p' (p `plusPtr` off) (fromIntegral l')
303 -- | Wrapper of mallocForeignPtrBytes with faster implementation
304 -- for GHC 6.5 builds newer than 06/06/06
305 mallocByteString :: Int -> IO (ForeignPtr a)
306 mallocByteString l = do
307 #if __GLASGOW_HASKELL__ >= 605 && !defined(SLOW_FOREIGN_PTR)
308 mallocPlainForeignPtrBytes l
310 mallocForeignPtrBytes l
313 #if defined(__GLASGOW_HASKELL__)
314 -- | /O(n)/ Pack a null-terminated sequence of bytes, pointed to by an
315 -- Addr\# (an arbitrary machine address assumed to point outside the
316 -- garbage-collected heap) into a @ByteString@. A much faster way to
317 -- create an Addr\# is with an unboxed string literal, than to pack a
318 -- boxed string. A unboxed string literal is compiled to a static @char
319 -- []@ by GHC. Establishing the length of the string requires a call to
320 -- @strlen(3)@, so the Addr# must point to a null-terminated buffer (as
321 -- is the case with "string"# literals in GHC). Use 'unsafePackAddress'
322 -- if you know the length of the string statically.
326 -- > literalFS = packAddress "literal"#
328 packAddress :: Addr# -> ByteString
329 packAddress addr# = inlinePerformIO $ do
330 p <- newForeignPtr_ cstr
332 return $ PS p 0 (fromIntegral l)
335 {-# INLINE packAddress #-}
337 -- | /O(1)/ 'unsafePackAddress' provides constant-time construction of
338 -- 'ByteStrings' -- which is ideal for string literals. It packs a
339 -- null-terminated sequence of bytes into a 'ByteString', given a raw
340 -- 'Addr\#' to the string, and the length of the string. Make sure the
341 -- length is correct, otherwise use the safer 'packAddress' (where the
342 -- length will be calculated once at runtime).
343 unsafePackAddress :: Int -> Addr# -> ByteString
344 unsafePackAddress len addr# = inlinePerformIO $ do
345 p <- newForeignPtr_ cstr
347 where cstr = Ptr addr#
349 -- | /O(1)/ Construct a 'ByteString' given a C Ptr Word8 buffer, a
350 -- length, and an IO action representing a finalizer. This function is
351 -- not available on Hugs.
353 packCStringFinalizer :: Ptr Word8 -> Int -> IO () -> IO ByteString
354 packCStringFinalizer p l f = do
355 fp <- FC.newForeignPtr p f
358 -- | Explicitly run the finaliser associated with a 'ByteString'.
359 -- Further references to this value may generate invalid memory
360 -- references. This operation is unsafe, as there may be other
361 -- 'ByteStrings' referring to the same underlying pages. If you use
362 -- this, you need to have a proof of some kind that all 'ByteString's
363 -- ever generated from the underlying byte array are no longer live.
364 unsafeFinalize :: ByteString -> IO ()
365 unsafeFinalize (PS p _ _) = FC.finalizeForeignPtr p
369 ------------------------------------------------------------------------
371 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
373 #if !defined(__GLASGOW_HASKELL__)
374 w2c = chr . fromIntegral
376 w2c = unsafeChr . fromIntegral
380 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
381 -- silently truncates to 8 bits Chars > '\255'. It is provided as
382 -- convenience for ByteString construction.
384 c2w = fromIntegral . ord
387 -- Selects white-space characters in the Latin-1 range
388 -- ordered by frequency
390 isSpaceWord8 :: Word8 -> Bool
391 isSpaceWord8 w = case w of
392 0x20 -> True -- SPACE
393 0x0A -> True -- LF, \n
394 0x09 -> True -- HT, \t
395 0x0C -> True -- FF, \f
396 0x0D -> True -- CR, \r
397 0x0B -> True -- VT, \v
398 0xA0 -> True -- spotted by QC..
400 {-# INLINE isSpaceWord8 #-}
402 ------------------------------------------------------------------------
403 -- | Just like unsafePerformIO, but we inline it. Big performance gains as
404 -- it exposes lots of things to further inlining
406 {-# INLINE inlinePerformIO #-}
407 inlinePerformIO :: IO a -> a
408 #if defined(__GLASGOW_HASKELL__)
409 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
411 inlinePerformIO = unsafePerformIO
414 -- | Count the number of occurrences of each byte.
416 {-# SPECIALIZE countOccurrences :: Ptr CSize -> Ptr Word8 -> Int -> IO () #-}
417 countOccurrences :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
418 STRICT3(countOccurrences)
419 countOccurrences counts str l = go 0
422 go i | i == l = return ()
423 | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
424 x <- peekElemOff counts k
425 pokeElemOff counts k (x + 1)
428 -- | /O(1) construction/ Use a @ByteString@ with a function requiring a
429 -- @CString@. Warning: modifying the @CString@ will affect the
430 -- @ByteString@. Why is this function unsafe? It relies on the null
431 -- byte at the end of the ByteString to be there. Unless you can
432 -- guarantee the null byte, you should use the safe version, which will
433 -- copy the string first.
434 unsafeUseAsCString :: ByteString -> (CString -> IO a) -> IO a
435 unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s)
437 -- | /O(1) construction/ Use a @ByteString@ with a function requiring a
439 unsafeUseAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a
440 unsafeUseAsCStringLen (PS ps s l) f = withForeignPtr ps $ \p -> f (castPtr p `plusPtr` s,l)
442 -- ---------------------------------------------------------------------
444 -- Standard C functions
447 foreign import ccall unsafe "string.h strlen" c_strlen
448 :: CString -> IO CSize
450 foreign import ccall unsafe "stdlib.h malloc" c_malloc
451 :: CSize -> IO (Ptr Word8)
453 foreign import ccall unsafe "static stdlib.h free" c_free
454 :: Ptr Word8 -> IO ()
456 foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
457 :: FunPtr (Ptr Word8 -> IO ())
459 foreign import ccall unsafe "string.h memchr" memchr
460 :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
462 foreign import ccall unsafe "string.h memcmp" memcmp
463 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
465 foreign import ccall unsafe "string.h memcpy" memcpy
466 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
468 foreign import ccall unsafe "string.h memmove" memmove
469 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
471 foreign import ccall unsafe "string.h memset" memset
472 :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
475 -- ---------------------------------------------------------------------
480 foreign import ccall unsafe "static fpstring.h fps_reverse" c_reverse
481 :: Ptr Word8 -> Ptr Word8 -> CULong -> IO ()
483 foreign import ccall unsafe "static fpstring.h fps_intersperse" c_intersperse
484 :: Ptr Word8 -> Ptr Word8 -> CULong -> Word8 -> IO ()
486 foreign import ccall unsafe "static fpstring.h fps_maximum" c_maximum
487 :: Ptr Word8 -> CULong -> IO Word8
489 foreign import ccall unsafe "static fpstring.h fps_minimum" c_minimum
490 :: Ptr Word8 -> CULong -> IO Word8
492 foreign import ccall unsafe "static fpstring.h fps_count" c_count
493 :: Ptr Word8 -> CULong -> Word8 -> IO CULong
495 -- ---------------------------------------------------------------------
499 foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
500 :: Int -> Int -> IO (Ptr Word8)
502 foreign import ccall unsafe "static unistd.h close" c_close
505 # if !defined(__OpenBSD__)
506 foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
507 :: Ptr Word8 -> Int -> IO Int
511 -- ---------------------------------------------------------------------
512 -- Internal GHC Haskell magic
514 #if defined(__GLASGOW_HASKELL__)
515 foreign import ccall unsafe "__hscore_memcpy_src_off"
516 memcpy_ptr_baoff :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())