1 {-# OPTIONS_GHC -cpp -fglasgow-exts #-}
3 -- Module : ByteString.Base
5 -- Maintainer : dons@cse.unsw.edu.au
6 -- Stability : experimental
7 -- Portability : portable, requires ffi and cpp
8 -- Tested with : GHC 6.4.1 and Hugs March 2005
11 -- | A module containing semi-public ByteString internals. This exposes
12 -- the ByteString representation and low level construction functions.
13 -- Modules which extend the ByteString system will need to use this module
14 -- while ideally most users will be able to make do with the public interface
17 module Data.ByteString.Base (
19 -- * The @ByteString@ type and representation
20 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
21 LazyByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
24 unsafeHead, -- :: ByteString -> Word8
25 unsafeTail, -- :: ByteString -> ByteString
26 unsafeIndex, -- :: ByteString -> Int -> Word8
27 unsafeTake, -- :: Int -> ByteString -> ByteString
28 unsafeDrop, -- :: Int -> ByteString -> ByteString
30 -- * Low level introduction and elimination
31 empty, -- :: ByteString
32 create, -- :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
33 createAndTrim, -- :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
34 createAndTrim', -- :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
35 mallocByteString, -- :: Int -> IO (ForeignPtr a)
37 unsafeCreate, -- :: Int -> (Ptr Word8 -> IO ()) -> ByteString
38 unsafeUseAsCString, -- :: ByteString -> (CString -> IO a) -> IO a
39 unsafeUseAsCStringLen, -- :: ByteString -> (CStringLen -> IO a) -> IO a
41 fromForeignPtr, -- :: ForeignPtr Word8 -> Int -> ByteString
42 toForeignPtr, -- :: ByteString -> (ForeignPtr Word8, Int, Int)
44 #if defined(__GLASGOW_HASKELL__)
45 packCStringFinalizer, -- :: Ptr Word8 -> Int -> IO () -> IO ByteString
46 packAddress, -- :: Addr# -> ByteString
47 unsafePackAddress, -- :: Int -> Addr# -> ByteString
48 unsafeFinalize, -- :: ByteString -> IO ()
52 inlinePerformIO, -- :: IO a -> a
53 nullForeignPtr, -- :: ForeignPtr Word8
55 countOccurrences, -- :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
57 -- * Standard C Functions
58 c_strlen, -- :: CString -> IO CInt
59 c_malloc, -- :: CInt -> IO (Ptr Word8)
60 c_free, -- :: Ptr Word8 -> IO ()
61 c_free_finalizer, -- :: FunPtr (Ptr Word8 -> IO ())
63 memchr, -- :: Ptr Word8 -> Word8 -> CSize -> IO Ptr Word8
64 memcmp, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
65 memcpy, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
66 memmove, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
67 memset, -- :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
70 c_reverse, -- :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
71 c_intersperse, -- :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
72 c_maximum, -- :: Ptr Word8 -> CInt -> IO Word8
73 c_minimum, -- :: Ptr Word8 -> CInt -> IO Word8
74 c_count, -- :: Ptr Word8 -> CInt -> Word8 -> IO CInt
76 -- * Internal GHC magic
77 #if defined(__GLASGOW_HASKELL__)
78 memcpy_ptr_baoff, -- :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())
82 w2c, c2w, isSpaceWord8
86 import Foreign.ForeignPtr (ForeignPtr, newForeignPtr_, withForeignPtr)
87 import Foreign.Ptr (Ptr, FunPtr, plusPtr, castPtr)
88 import Foreign.Storable (Storable(..))
89 import Foreign.C.Types (CInt, CSize, CULong)
90 import Foreign.C.String (CString, CStringLen)
92 import Control.Exception (assert)
94 import Data.Char (ord)
95 import Data.Word (Word8)
97 #if defined(__GLASGOW_HASKELL__)
98 import qualified Foreign.ForeignPtr as FC (finalizeForeignPtr)
99 import qualified Foreign.Concurrent as FC (newForeignPtr)
101 import Data.Generics (Data(..), Typeable(..))
102 import GHC.Prim (Addr#)
103 import GHC.Ptr (Ptr(..))
104 import GHC.Base (realWorld#,unsafeChr)
105 import GHC.IOBase (IO(IO), unsafePerformIO, RawBuffer)
107 import Data.Char (chr)
108 import System.IO.Unsafe (unsafePerformIO)
111 #if __GLASGOW_HASKELL__ >= 605 && !defined(SLOW_FOREIGN_PTR)
112 import GHC.ForeignPtr (mallocPlainForeignPtrBytes)
114 import Foreign.ForeignPtr (mallocForeignPtrBytes)
117 #if __GLASGOW_HASKELL__>=605
118 import GHC.ForeignPtr (ForeignPtr(ForeignPtr))
119 import GHC.Base (nullAddr#)
121 import Foreign.Ptr (nullPtr)
124 -- CFILES stuff is Hugs only
125 {-# CFILES cbits/fpstring.c #-}
127 -- -----------------------------------------------------------------------------
129 -- Useful macros, until we have bang patterns
132 #define STRICT1(f) f a | a `seq` False = undefined
133 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
134 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
135 #define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
136 #define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
138 -- -----------------------------------------------------------------------------
140 -- | A space-efficient representation of a Word8 vector, supporting many
141 -- efficient operations. A 'ByteString' contains 8-bit characters only.
143 -- Instances of Eq, Ord, Read, Show, Data, Typeable
145 data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8)
146 {-# UNPACK #-} !Int -- offset
147 {-# UNPACK #-} !Int -- length
149 #if defined(__GLASGOW_HASKELL__)
150 deriving (Data, Typeable)
153 instance Show ByteString where
154 showsPrec p ps r = showsPrec p (unpackWith w2c ps) r
156 instance Read ByteString where
157 readsPrec p str = [ (packWith c2w x, y) | (x, y) <- readsPrec p str ]
159 -- | /O(n)/ Converts a 'ByteString' to a '[a]', using a conversion function.
160 unpackWith :: (Word8 -> a) -> ByteString -> [a]
161 unpackWith _ (PS _ _ 0) = []
162 unpackWith k (PS ps s l) = inlinePerformIO $ withForeignPtr ps $ \p ->
163 go (p `plusPtr` s) (l - 1) []
166 go p 0 acc = peek p >>= \e -> return (k e : acc)
167 go p n acc = peekByteOff p n >>= \e -> go p (n-1) (k e : acc)
168 {-# INLINE unpackWith #-}
169 {-# SPECIALIZE unpackWith :: (Word8 -> Char) -> ByteString -> [Char] #-}
171 -- | /O(n)/ Convert a '[a]' into a 'ByteString' using some
172 -- conversion function
173 packWith :: (a -> Word8) -> [a] -> ByteString
174 packWith k str = unsafeCreate (length str) $ \p -> go p str
178 go p (x:xs) = poke p (k x) >> go (p `plusPtr` 1) xs -- less space than pokeElemOff
179 {-# INLINE packWith #-}
180 {-# SPECIALIZE packWith :: (Char -> Word8) -> [Char] -> ByteString #-}
182 ------------------------------------------------------------------------
184 -- | A space-efficient representation of a Word8 vector, supporting many
185 -- efficient operations. A 'ByteString' contains 8-bit characters only.
187 -- Instances of Eq, Ord, Read, Show, Data, Typeable
189 newtype LazyByteString = LPS [ByteString] -- LPS for lazy packed string
191 #if defined(__GLASGOW_HASKELL__)
196 ------------------------------------------------------------------------
198 -- | /O(1)/ The empty 'ByteString'
200 empty = PS nullForeignPtr 0 0
202 nullForeignPtr :: ForeignPtr Word8
203 #if __GLASGOW_HASKELL__>=605
204 nullForeignPtr = ForeignPtr nullAddr# undefined --TODO: should ForeignPtrContents be strict?
206 nullForeignPtr = unsafePerformIO $ newForeignPtr_ nullPtr
207 {-# NOINLINE nullForeignPtr #-}
210 -- ---------------------------------------------------------------------
212 -- Extensions to the basic interface
215 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits the
216 -- check for the empty case, so there is an obligation on the programmer
217 -- to provide a proof that the ByteString is non-empty.
218 unsafeHead :: ByteString -> Word8
219 unsafeHead (PS x s l) = assert (l > 0) $
220 inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p s
221 {-# INLINE unsafeHead #-}
223 -- | A variety of 'tail' for non-empty ByteStrings. 'unsafeTail' omits the
224 -- check for the empty case. As with 'unsafeHead', the programmer must
225 -- provide a separate proof that the ByteString is non-empty.
226 unsafeTail :: ByteString -> ByteString
227 unsafeTail (PS ps s l) = assert (l > 0) $ PS ps (s+1) (l-1)
228 {-# INLINE unsafeTail #-}
230 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a 'Word8'
231 -- This omits the bounds check, which means there is an accompanying
232 -- obligation on the programmer to ensure the bounds are checked in some
234 unsafeIndex :: ByteString -> Int -> Word8
235 unsafeIndex (PS x s l) i = assert (i >= 0 && i < l) $
236 inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p (s+i)
237 {-# INLINE unsafeIndex #-}
239 -- | A variety of 'take' which omits the checks on @n@ so there is an
240 -- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
241 unsafeTake :: Int -> ByteString -> ByteString
242 unsafeTake n (PS x s l) = assert (0 <= n && n <= l) $ PS x s n
243 {-# INLINE unsafeTake #-}
245 -- | A variety of 'drop' which omits the checks on @n@ so there is an
246 -- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
247 unsafeDrop :: Int -> ByteString -> ByteString
248 unsafeDrop n (PS x s l) = assert (0 <= n && n <= l) $ PS x (s+n) (l-n)
249 {-# INLINE unsafeDrop #-}
251 -- ---------------------------------------------------------------------
252 -- Low level constructors
254 -- | /O(1)/ Build a ByteString from a ForeignPtr
255 fromForeignPtr :: ForeignPtr Word8 -> Int -> ByteString
256 fromForeignPtr fp l = PS fp 0 l
258 -- | /O(1)/ Deconstruct a ForeignPtr from a ByteString
259 toForeignPtr :: ByteString -> (ForeignPtr Word8, Int, Int)
260 toForeignPtr (PS ps s l) = (ps, s, l)
262 -- | A way of creating ByteStrings outside the IO monad. The @Int@
263 -- argument gives the final size of the ByteString. Unlike
264 -- 'createAndTrim' the ByteString is not reallocated if the final size
265 -- is less than the estimated size.
266 unsafeCreate :: Int -> (Ptr Word8 -> IO ()) -> ByteString
267 unsafeCreate l f = unsafePerformIO (create l f)
268 {-# INLINE unsafeCreate #-}
270 -- | Create ByteString of size @l@ and use action @f@ to fill it's contents.
271 create :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
273 fp <- mallocByteString l
274 withForeignPtr fp $ \p -> f p
277 -- | Given the maximum size needed and a function to make the contents
278 -- of a ByteString, createAndTrim makes the 'ByteString'. The generating
279 -- function is required to return the actual final size (<= the maximum
280 -- size), and the resulting byte array is realloced to this size.
282 -- createAndTrim is the main mechanism for creating custom, efficient
283 -- ByteString functions, using Haskell or C functions to fill the space.
285 createAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
286 createAndTrim l f = do
287 fp <- mallocByteString l
288 withForeignPtr fp $ \p -> do
290 if assert (l' <= l) $ l' >= l
291 then return $! PS fp 0 l
292 else create l' $ \p' -> memcpy p' p (fromIntegral l')
294 createAndTrim' :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
295 createAndTrim' l f = do
296 fp <- mallocByteString l
297 withForeignPtr fp $ \p -> do
298 (off, l', res) <- f p
299 if assert (l' <= l) $ l' >= l
300 then return $! (PS fp 0 l, res)
301 else do ps <- create l' $ \p' ->
302 memcpy p' (p `plusPtr` off) (fromIntegral l')
305 -- | Wrapper of mallocForeignPtrBytes with faster implementation
306 -- for GHC 6.5 builds newer than 06/06/06
307 mallocByteString :: Int -> IO (ForeignPtr a)
308 mallocByteString l = do
309 #if __GLASGOW_HASKELL__ >= 605 && !defined(SLOW_FOREIGN_PTR)
310 mallocPlainForeignPtrBytes l
312 mallocForeignPtrBytes l
315 #if defined(__GLASGOW_HASKELL__)
316 -- | /O(n)/ Pack a null-terminated sequence of bytes, pointed to by an
317 -- Addr\# (an arbitrary machine address assumed to point outside the
318 -- garbage-collected heap) into a @ByteString@. A much faster way to
319 -- create an Addr\# is with an unboxed string literal, than to pack a
320 -- boxed string. A unboxed string literal is compiled to a static @char
321 -- []@ by GHC. Establishing the length of the string requires a call to
322 -- @strlen(3)@, so the Addr# must point to a null-terminated buffer (as
323 -- is the case with "string"# literals in GHC). Use 'unsafePackAddress'
324 -- if you know the length of the string statically.
328 -- > literalFS = packAddress "literal"#
330 packAddress :: Addr# -> ByteString
331 packAddress addr# = inlinePerformIO $ do
332 p <- newForeignPtr_ cstr
334 return $ PS p 0 (fromIntegral l)
337 {-# INLINE packAddress #-}
339 -- | /O(1)/ 'unsafePackAddress' provides constant-time construction of
340 -- 'ByteStrings' -- which is ideal for string literals. It packs a
341 -- null-terminated sequence of bytes into a 'ByteString', given a raw
342 -- 'Addr\#' to the string, and the length of the string. Make sure the
343 -- length is correct, otherwise use the safer 'packAddress' (where the
344 -- length will be calculated once at runtime).
345 unsafePackAddress :: Int -> Addr# -> ByteString
346 unsafePackAddress len addr# = inlinePerformIO $ do
347 p <- newForeignPtr_ cstr
349 where cstr = Ptr addr#
351 -- | /O(1)/ Construct a 'ByteString' given a C Ptr Word8 buffer, a
352 -- length, and an IO action representing a finalizer. This function is
353 -- not available on Hugs.
355 packCStringFinalizer :: Ptr Word8 -> Int -> IO () -> IO ByteString
356 packCStringFinalizer p l f = do
357 fp <- FC.newForeignPtr p f
360 -- | Explicitly run the finaliser associated with a 'ByteString'.
361 -- Further references to this value may generate invalid memory
362 -- references. This operation is unsafe, as there may be other
363 -- 'ByteStrings' referring to the same underlying pages. If you use
364 -- this, you need to have a proof of some kind that all 'ByteString's
365 -- ever generated from the underlying byte array are no longer live.
366 unsafeFinalize :: ByteString -> IO ()
367 unsafeFinalize (PS p _ _) = FC.finalizeForeignPtr p
371 ------------------------------------------------------------------------
373 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
375 #if !defined(__GLASGOW_HASKELL__)
376 w2c = chr . fromIntegral
378 w2c = unsafeChr . fromIntegral
382 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
383 -- silently truncates to 8 bits Chars > '\255'. It is provided as
384 -- convenience for ByteString construction.
386 c2w = fromIntegral . ord
389 -- Selects white-space characters in the Latin-1 range
390 -- ordered by frequency
392 isSpaceWord8 :: Word8 -> Bool
393 isSpaceWord8 w = case w of
394 0x20 -> True -- SPACE
395 0x0A -> True -- LF, \n
396 0x09 -> True -- HT, \t
397 0x0C -> True -- FF, \f
398 0x0D -> True -- CR, \r
399 0x0B -> True -- VT, \v
400 0xA0 -> True -- spotted by QC..
402 {-# INLINE isSpaceWord8 #-}
404 ------------------------------------------------------------------------
405 -- | Just like unsafePerformIO, but we inline it. Big performance gains as
406 -- it exposes lots of things to further inlining
408 {-# INLINE inlinePerformIO #-}
409 inlinePerformIO :: IO a -> a
410 #if defined(__GLASGOW_HASKELL__)
411 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
413 inlinePerformIO = unsafePerformIO
416 -- | Count the number of occurrences of each byte.
418 {-# SPECIALIZE countOccurrences :: Ptr CSize -> Ptr Word8 -> Int -> IO () #-}
419 countOccurrences :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
420 STRICT3(countOccurrences)
421 countOccurrences counts str l = go 0
424 go i | i == l = return ()
425 | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
426 x <- peekElemOff counts k
427 pokeElemOff counts k (x + 1)
430 -- | /O(1) construction/ Use a @ByteString@ with a function requiring a
431 -- @CString@. Warning: modifying the @CString@ will affect the
432 -- @ByteString@. Why is this function unsafe? It relies on the null
433 -- byte at the end of the ByteString to be there. Unless you can
434 -- guarantee the null byte, you should use the safe version, which will
435 -- copy the string first.
436 unsafeUseAsCString :: ByteString -> (CString -> IO a) -> IO a
437 unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s)
439 -- | /O(1) construction/ Use a @ByteString@ with a function requiring a
441 unsafeUseAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a
442 unsafeUseAsCStringLen (PS ps s l) f = withForeignPtr ps $ \p -> f (castPtr p `plusPtr` s,l)
444 -- ---------------------------------------------------------------------
446 -- Standard C functions
449 foreign import ccall unsafe "string.h strlen" c_strlen
450 :: CString -> IO CSize
452 foreign import ccall unsafe "stdlib.h malloc" c_malloc
453 :: CSize -> IO (Ptr Word8)
455 foreign import ccall unsafe "static stdlib.h free" c_free
456 :: Ptr Word8 -> IO ()
458 foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
459 :: FunPtr (Ptr Word8 -> IO ())
461 foreign import ccall unsafe "string.h memchr" memchr
462 :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
464 foreign import ccall unsafe "string.h memcmp" memcmp
465 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
467 foreign import ccall unsafe "string.h memcpy" memcpy
468 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
470 foreign import ccall unsafe "string.h memmove" memmove
471 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
473 foreign import ccall unsafe "string.h memset" memset
474 :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
477 -- ---------------------------------------------------------------------
482 foreign import ccall unsafe "static fpstring.h fps_reverse" c_reverse
483 :: Ptr Word8 -> Ptr Word8 -> CULong -> IO ()
485 foreign import ccall unsafe "static fpstring.h fps_intersperse" c_intersperse
486 :: Ptr Word8 -> Ptr Word8 -> CULong -> Word8 -> IO ()
488 foreign import ccall unsafe "static fpstring.h fps_maximum" c_maximum
489 :: Ptr Word8 -> CULong -> IO Word8
491 foreign import ccall unsafe "static fpstring.h fps_minimum" c_minimum
492 :: Ptr Word8 -> CULong -> IO Word8
494 foreign import ccall unsafe "static fpstring.h fps_count" c_count
495 :: Ptr Word8 -> CULong -> Word8 -> IO CULong
497 -- ---------------------------------------------------------------------
501 foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
502 :: Int -> Int -> IO (Ptr Word8)
504 foreign import ccall unsafe "static unistd.h close" c_close
507 # if !defined(__OpenBSD__)
508 foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
509 :: Ptr Word8 -> Int -> IO Int
513 -- ---------------------------------------------------------------------
514 -- Internal GHC Haskell magic
516 #if defined(__GLASGOW_HASKELL__)
517 foreign import ccall unsafe "__hscore_memcpy_src_off"
518 memcpy_ptr_baoff :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())