1 {-# OPTIONS_GHC -cpp -fglasgow-exts #-}
3 -- Module : ByteString.Base
5 -- Maintainer : dons@cse.unsw.edu.au
6 -- Stability : experimental
7 -- Portability : portable, requires ffi and cpp
8 -- Tested with : GHC 6.4.1 and Hugs March 2005
11 -- | A module containing semi-public ByteString internals. This exposes
12 -- the ByteString representation and low level construction functions.
13 -- Modules which extend the ByteString system will need to use this module
14 -- while ideally most users will be able to make do with the public interface
17 module Data.ByteString.Base (
19 -- * The @ByteString@ type and representation
20 ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable
23 unsafeHead, -- :: ByteString -> Word8
24 unsafeTail, -- :: ByteString -> ByteString
25 unsafeIndex, -- :: ByteString -> Int -> Word8
26 unsafeTake, -- :: Int -> ByteString -> ByteString
27 unsafeDrop, -- :: Int -> ByteString -> ByteString
29 -- * Low level introduction and elimination
30 create, -- :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
31 createAndTrim, -- :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
32 createAndTrim', -- :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
34 unsafeCreate, -- :: Int -> (Ptr Word8 -> IO ()) -> ByteString
35 unsafeUseAsCString, -- :: ByteString -> (CString -> IO a) -> IO a
36 unsafeUseAsCStringLen, -- :: ByteString -> (CStringLen -> IO a) -> IO a
38 fromForeignPtr, -- :: ForeignPtr Word8 -> Int -> ByteString
39 toForeignPtr, -- :: ByteString -> (ForeignPtr Word8, Int, Int)
41 #if defined(__GLASGOW_HASKELL__)
42 packCStringFinalizer, -- :: Ptr Word8 -> Int -> IO () -> IO ByteString
43 packAddress, -- :: Addr# -> ByteString
44 unsafePackAddress, -- :: Int -> Addr# -> ByteString
45 unsafeFinalize, -- :: ByteString -> IO ()
49 inlinePerformIO, -- :: IO a -> a
51 countOccurrences, -- :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
53 -- * Standard C Functions
54 c_strlen, -- :: CString -> IO CInt
55 c_malloc, -- :: CInt -> IO (Ptr Word8)
56 c_free, -- :: Ptr Word8 -> IO ()
58 #if !defined(__GLASGOW_HASKELL__)
59 c_free_finalizer, -- :: FunPtr (Ptr Word8 -> IO ())
62 memchr, -- :: Ptr Word8 -> Word8 -> CSize -> IO Ptr Word8
63 memcmp, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
64 memcpy, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
65 memmove, -- :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
66 memset, -- :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
69 c_reverse, -- :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
70 c_intersperse, -- :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
71 c_maximum, -- :: Ptr Word8 -> CInt -> IO Word8
72 c_minimum, -- :: Ptr Word8 -> CInt -> IO Word8
73 c_count, -- :: Ptr Word8 -> CInt -> Word8 -> IO CInt
75 -- * Internal GHC magic
76 #if defined(__GLASGOW_HASKELL__)
77 getProgArgv, -- :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
78 memcpy_ptr_baoff, -- :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())
82 w2c, c2w, isSpaceWord8
86 import Foreign.ForeignPtr
88 import Foreign.Storable (Storable(..))
89 import Foreign.C.Types
90 import Foreign.C.String (CString, CStringLen)
92 import Control.Exception (assert)
94 import Data.Char (ord)
95 import Data.Word (Word8)
97 #if defined(__GLASGOW_HASKELL__)
98 import qualified Foreign.Concurrent as FC (newForeignPtr)
100 import Data.Generics (Data(..), Typeable(..))
101 import GHC.Prim (Addr#)
102 import GHC.Ptr (Ptr(..))
103 import GHC.Base (realWorld#,unsafeChr)
106 #if defined(__GLASGOW_HASKELL__) && !defined(SLOW_FOREIGN_PTR)
107 import GHC.ForeignPtr (mallocPlainForeignPtrBytes)
111 import Data.Char (chr)
112 import System.IO.Unsafe (unsafePerformIO)
115 -- CFILES stuff is Hugs only
116 {-# CFILES cbits/fpstring.c #-}
118 -- -----------------------------------------------------------------------------
120 -- Useful macros, until we have bang patterns
123 #define STRICT1(f) f a | a `seq` False = undefined
124 #define STRICT2(f) f a b | a `seq` b `seq` False = undefined
125 #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined
126 #define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined
127 #define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined
129 -- -----------------------------------------------------------------------------
131 -- | A space-efficient representation of a Word8 vector, supporting many
132 -- efficient operations. A 'ByteString' contains 8-bit characters only.
134 -- Instances of Eq, Ord, Read, Show, Data, Typeable
136 data ByteString = PS {-# UNPACK #-} !(ForeignPtr Word8)
137 {-# UNPACK #-} !Int -- offset
138 {-# UNPACK #-} !Int -- length
140 #if defined(__GLASGOW_HASKELL__)
141 deriving (Data, Typeable)
144 -- ---------------------------------------------------------------------
146 -- Extensions to the basic interface
149 -- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits the
150 -- check for the empty case, so there is an obligation on the programmer
151 -- to provide a proof that the ByteString is non-empty.
152 unsafeHead :: ByteString -> Word8
153 unsafeHead (PS x s l) = assert (l > 0) $
154 inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p s
155 {-# INLINE unsafeHead #-}
157 -- | A variety of 'tail' for non-empty ByteStrings. 'unsafeTail' omits the
158 -- check for the empty case. As with 'unsafeHead', the programmer must
159 -- provide a separate proof that the ByteString is non-empty.
160 unsafeTail :: ByteString -> ByteString
161 unsafeTail (PS ps s l) = assert (l > 0) $ PS ps (s+1) (l-1)
162 {-# INLINE unsafeTail #-}
164 -- | Unsafe 'ByteString' index (subscript) operator, starting from 0, returning a 'Word8'
165 -- This omits the bounds check, which means there is an accompanying
166 -- obligation on the programmer to ensure the bounds are checked in some
168 unsafeIndex :: ByteString -> Int -> Word8
169 unsafeIndex (PS x s l) i = assert (i >= 0 && i < l) $
170 inlinePerformIO $ withForeignPtr x $ \p -> peekByteOff p (s+i)
171 {-# INLINE unsafeIndex #-}
173 -- | A variety of 'take' which omits the checks on @n@ so there is an
174 -- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
175 unsafeTake :: Int -> ByteString -> ByteString
176 unsafeTake n (PS x s l) = assert (0 <= n && n <= l) $ PS x s n
177 {-# INLINE unsafeTake #-}
179 -- | A variety of 'drop' which omits the checks on @n@ so there is an
180 -- obligation on the programmer to provide a proof that @0 <= n <= 'length' xs@.
181 unsafeDrop :: Int -> ByteString -> ByteString
182 unsafeDrop n (PS x s l) = assert (0 <= n && n <= l) $ PS x (s+n) (l-n)
183 {-# INLINE unsafeDrop #-}
185 -- ---------------------------------------------------------------------
186 -- Low level constructors
188 -- | /O(1)/ Build a ByteString from a ForeignPtr
189 fromForeignPtr :: ForeignPtr Word8 -> Int -> ByteString
190 fromForeignPtr fp l = PS fp 0 l
192 -- | /O(1)/ Deconstruct a ForeignPtr from a ByteString
193 toForeignPtr :: ByteString -> (ForeignPtr Word8, Int, Int)
194 toForeignPtr (PS ps s l) = (ps, s, l)
196 -- | A way of creating ByteStrings outside the IO monad. The @Int@
197 -- argument gives the final size of the ByteString. Unlike
198 -- 'createAndTrim' the ByteString is not reallocated if the final size
199 -- is less than the estimated size.
200 unsafeCreate :: Int -> (Ptr Word8 -> IO ()) -> ByteString
201 unsafeCreate l f = unsafePerformIO (create l f)
202 {-# INLINE unsafeCreate #-}
204 -- | Wrapper of mallocForeignPtrBytes.
205 create :: Int -> (Ptr Word8 -> IO ()) -> IO ByteString
207 #if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__)
208 fp <- mallocForeignPtrBytes l
210 fp <- mallocPlainForeignPtrBytes l
212 withForeignPtr fp $ \p -> f p
215 -- | Given the maximum size needed and a function to make the contents
216 -- of a ByteString, createAndTrim makes the 'ByteString'. The generating
217 -- function is required to return the actual final size (<= the maximum
218 -- size), and the resulting byte array is realloced to this size.
220 -- createAndTrim is the main mechanism for creating custom, efficient
221 -- ByteString functions, using Haskell or C functions to fill the space.
223 createAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO ByteString
224 createAndTrim l f = do
225 #if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__)
226 fp <- mallocForeignPtrBytes l
228 fp <- mallocPlainForeignPtrBytes l
230 withForeignPtr fp $ \p -> do
232 if assert (l' <= l) $ l' >= l
233 then return $! PS fp 0 l
234 else create l' $ \p' -> memcpy p' p (fromIntegral l')
236 createAndTrim' :: Int -> (Ptr Word8 -> IO (Int, Int, a)) -> IO (ByteString, a)
237 createAndTrim' l f = do
238 #if defined(SLOW_FOREIGN_PTR) || !defined(__GLASGOW_HASKELL__)
239 fp <- mallocForeignPtrBytes l
241 fp <- mallocPlainForeignPtrBytes l
243 withForeignPtr fp $ \p -> do
244 (off, l', res) <- f p
245 if assert (l' <= l) $ l' >= l
246 then return $! (PS fp 0 l, res)
247 else do ps <- create l' $ \p' ->
248 memcpy p' (p `plusPtr` off) (fromIntegral l')
251 #if defined(__GLASGOW_HASKELL__)
252 -- | /O(n)/ Pack a null-terminated sequence of bytes, pointed to by an
253 -- Addr\# (an arbitrary machine address assumed to point outside the
254 -- garbage-collected heap) into a @ByteString@. A much faster way to
255 -- create an Addr\# is with an unboxed string literal, than to pack a
256 -- boxed string. A unboxed string literal is compiled to a static @char
257 -- []@ by GHC. Establishing the length of the string requires a call to
258 -- @strlen(3)@, so the Addr# must point to a null-terminated buffer (as
259 -- is the case with "string"# literals in GHC). Use 'unsafePackAddress'
260 -- if you know the length of the string statically.
264 -- > literalFS = packAddress "literal"#
266 packAddress :: Addr# -> ByteString
267 packAddress addr# = inlinePerformIO $ do
268 p <- newForeignPtr_ cstr
270 return $ PS p 0 (fromIntegral l)
273 {-# INLINE packAddress #-}
275 -- | /O(1)/ 'unsafePackAddress' provides constant-time construction of
276 -- 'ByteStrings' -- which is ideal for string literals. It packs a
277 -- null-terminated sequence of bytes into a 'ByteString', given a raw
278 -- 'Addr\#' to the string, and the length of the string. Make sure the
279 -- length is correct, otherwise use the safer 'packAddress' (where the
280 -- length will be calculated once at runtime).
281 unsafePackAddress :: Int -> Addr# -> ByteString
282 unsafePackAddress len addr# = inlinePerformIO $ do
283 p <- newForeignPtr_ cstr
285 where cstr = Ptr addr#
287 -- | /O(1)/ Construct a 'ByteString' given a C Ptr Word8 buffer, a
288 -- length, and an IO action representing a finalizer. This function is
289 -- not available on Hugs.
291 packCStringFinalizer :: Ptr Word8 -> Int -> IO () -> IO ByteString
292 packCStringFinalizer p l f = do
293 fp <- FC.newForeignPtr p f
296 -- | Explicitly run the finaliser associated with a 'ByteString'.
297 -- Further references to this value may generate invalid memory
298 -- references. This operation is unsafe, as there may be other
299 -- 'ByteStrings' referring to the same underlying pages. If you use
300 -- this, you need to have a proof of some kind that all 'ByteString's
301 -- ever generated from the underlying byte array are no longer live.
302 unsafeFinalize :: ByteString -> IO ()
303 unsafeFinalize (PS p _ _) = finalizeForeignPtr p
307 ------------------------------------------------------------------------
309 -- | Conversion between 'Word8' and 'Char'. Should compile to a no-op.
311 #if !defined(__GLASGOW_HASKELL__)
312 w2c = chr . fromIntegral
314 w2c = unsafeChr . fromIntegral
318 -- | Unsafe conversion between 'Char' and 'Word8'. This is a no-op and
319 -- silently truncates to 8 bits Chars > '\255'. It is provided as
320 -- convenience for ByteString construction.
322 c2w = fromIntegral . ord
325 -- Selects white-space characters in the Latin-1 range
326 -- ordered by frequency
328 isSpaceWord8 :: Word8 -> Bool
329 isSpaceWord8 w = case w of
330 0x20 -> True -- SPACE
331 0x0A -> True -- LF, \n
332 0x09 -> True -- HT, \t
333 0x0C -> True -- FF, \f
334 0x0D -> True -- CR, \r
335 0x0B -> True -- VT, \v
336 0xA0 -> True -- spotted by QC..
338 {-# INLINE isSpaceWord8 #-}
340 ------------------------------------------------------------------------
341 -- | Just like unsafePerformIO, but we inline it. Big performance gains as
342 -- it exposes lots of things to further inlining
344 {-# INLINE inlinePerformIO #-}
345 inlinePerformIO :: IO a -> a
346 #if defined(__GLASGOW_HASKELL__)
347 inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
349 inlinePerformIO = unsafePerformIO
352 -- | Count the number of occurrences of each byte.
354 {-# SPECIALIZE countOccurrences :: Ptr CSize -> Ptr Word8 -> Int -> IO () #-}
355 countOccurrences :: (Storable a, Num a) => Ptr a -> Ptr Word8 -> Int -> IO ()
356 STRICT3(countOccurrences)
357 countOccurrences counts str l = go 0
360 go i | i == l = return ()
361 | otherwise = do k <- fromIntegral `fmap` peekElemOff str i
362 x <- peekElemOff counts k
363 pokeElemOff counts k (x + 1)
366 -- | /O(1) construction/ Use a @ByteString@ with a function requiring a
367 -- @CString@. Warning: modifying the @CString@ will affect the
368 -- @ByteString@. Why is this function unsafe? It relies on the null
369 -- byte at the end of the ByteString to be there. Unless you can
370 -- guarantee the null byte, you should use the safe version, which will
371 -- copy the string first.
372 unsafeUseAsCString :: ByteString -> (CString -> IO a) -> IO a
373 unsafeUseAsCString (PS ps s _) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s)
375 -- | /O(1) construction/ Use a @ByteString@ with a function requiring a
376 -- @CStringLen@. Warning: modifying the @CStringLen@ will affect the
377 -- @ByteString@. This is analogous to unsafeUseAsCString, and comes
378 -- with the same safety requirements. The user must ensure there is a
379 -- null byte at the end of the string.
380 unsafeUseAsCStringLen :: ByteString -> (CStringLen -> IO a) -> IO a
381 unsafeUseAsCStringLen (PS ps s l) ac = withForeignPtr ps $ \p -> ac (castPtr p `plusPtr` s,l)
383 -- ---------------------------------------------------------------------
385 -- Standard C functions
388 foreign import ccall unsafe "string.h strlen" c_strlen
389 :: CString -> IO CInt
391 foreign import ccall unsafe "stdlib.h malloc" c_malloc
392 :: CInt -> IO (Ptr Word8)
394 foreign import ccall unsafe "static stdlib.h free" c_free
395 :: Ptr Word8 -> IO ()
397 #if !defined(__GLASGOW_HASKELL__)
398 foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
399 :: FunPtr (Ptr Word8 -> IO ())
402 foreign import ccall unsafe "string.h memchr" memchr
403 :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
405 foreign import ccall unsafe "string.h memcmp" memcmp
406 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
408 foreign import ccall unsafe "string.h memcpy" memcpy
409 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
411 foreign import ccall unsafe "string.h memmove" memmove
412 :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
414 foreign import ccall unsafe "string.h memset" memset
415 :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8)
418 -- ---------------------------------------------------------------------
423 foreign import ccall unsafe "static fpstring.h fps_reverse" c_reverse
424 :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
426 foreign import ccall unsafe "static fpstring.h fps_intersperse" c_intersperse
427 :: Ptr Word8 -> Ptr Word8 -> CInt -> Word8 -> IO ()
429 foreign import ccall unsafe "static fpstring.h fps_maximum" c_maximum
430 :: Ptr Word8 -> CInt -> IO Word8
432 foreign import ccall unsafe "static fpstring.h fps_minimum" c_minimum
433 :: Ptr Word8 -> CInt -> IO Word8
435 foreign import ccall unsafe "static fpstring.h fps_count" c_count
436 :: Ptr Word8 -> CInt -> Word8 -> IO CInt
438 -- ---------------------------------------------------------------------
442 foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
443 :: Int -> Int -> IO (Ptr Word8)
445 foreign import ccall unsafe "static unistd.h close" c_close
448 # if !defined(__OpenBSD__)
449 foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
450 :: Ptr Word8 -> Int -> IO Int
454 -- ---------------------------------------------------------------------
455 -- Internal GHC Haskell magic
457 #if defined(__GLASGOW_HASKELL__)
458 foreign import ccall unsafe "RtsAPI.h getProgArgv"
459 getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
461 foreign import ccall unsafe "__hscore_memcpy_src_off"
462 memcpy_ptr_baoff :: Ptr a -> RawBuffer -> CInt -> CSize -> IO (Ptr ())