1 {-# LANGUAGE CPP, NoImplicitPrelude #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding
6 -- Copyright : (c) The University of Glasgow, 2008-2011
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- Foreign marshalling support for CStrings with configurable encodings
15 -----------------------------------------------------------------------------
18 -- * C strings with a configurable encoding
20 -- conversion of C strings into Haskell strings
22 peekCString, -- :: TextEncoding -> CString -> IO String
23 peekCStringLen, -- :: TextEncoding -> CStringLen -> IO String
25 -- conversion of Haskell strings into C strings
27 newCString, -- :: TextEncoding -> String -> IO CString
28 newCStringLen, -- :: TextEncoding -> String -> IO CStringLen
30 -- conversion of Haskell strings into C strings using temporary storage
32 withCString, -- :: TextEncoding -> String -> (CString -> IO a) -> IO a
33 withCStringLen, -- :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
35 charIsRepresentable, -- :: TextEncoding -> Char -> IO Bool
38 import Foreign.Marshal.Array
39 import Foreign.C.Types
41 import Foreign.Storable
45 -- Imports for the locale-encoding version of marshallers
48 import Data.Tuple (fst)
51 import {-# SOURCE #-} System.Posix.Internals (puts)
52 import GHC.Show ( show )
54 import Foreign.Marshal.Alloc
55 import Foreign.ForeignPtr
57 import GHC.Err (undefined)
63 import GHC.IO.Exception
65 import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
66 import GHC.IO.Encoding.Types
72 putDebugMsg :: String -> IO ()
73 putDebugMsg | c_DEBUG_DUMP = puts
74 | otherwise = const (return ())
77 -- These definitions are identical to those in Foreign.C.String, but copied in here to avoid a cycle:
78 type CString = Ptr CChar
79 type CStringLen = (Ptr CChar, Int)
84 -- | Marshal a NUL terminated C string into a Haskell string.
86 peekCString :: TextEncoding -> CString -> IO String
87 peekCString enc cp = do
88 sz <- lengthArray0 nUL cp
89 peekEncodedCString enc (cp, sz * cCharSize)
91 -- | Marshal a C string with explicit length into a Haskell string.
93 peekCStringLen :: TextEncoding -> CStringLen -> IO String
94 peekCStringLen = peekEncodedCString
96 -- | Marshal a Haskell string into a NUL terminated C string.
98 -- * the Haskell string may /not/ contain any NUL characters
100 -- * new storage is allocated for the C string and must be
101 -- explicitly freed using 'Foreign.Marshal.Alloc.free' or
102 -- 'Foreign.Marshal.Alloc.finalizerFree'.
104 newCString :: TextEncoding -> String -> IO CString
105 newCString enc = liftM fst . newEncodedCString enc True
107 -- | Marshal a Haskell string into a C string (ie, character array) with
108 -- explicit length information.
110 -- * new storage is allocated for the C string and must be
111 -- explicitly freed using 'Foreign.Marshal.Alloc.free' or
112 -- 'Foreign.Marshal.Alloc.finalizerFree'.
114 newCStringLen :: TextEncoding -> String -> IO CStringLen
115 newCStringLen enc = newEncodedCString enc False
117 -- | Marshal a Haskell string into a NUL terminated C string using temporary
120 -- * the Haskell string may /not/ contain any NUL characters
122 -- * the memory is freed when the subcomputation terminates (either
123 -- normally or via an exception), so the pointer to the temporary
124 -- storage must /not/ be used after this.
126 withCString :: TextEncoding -> String -> (CString -> IO a) -> IO a
127 withCString enc s act = withEncodedCString enc True s $ \(cp, _sz) -> act cp
129 -- | Marshal a Haskell string into a C string (ie, character array)
130 -- in temporary storage, with explicit length information.
132 -- * the memory is freed when the subcomputation terminates (either
133 -- normally or via an exception), so the pointer to the temporary
134 -- storage must /not/ be used after this.
136 withCStringLen :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
137 withCStringLen enc = withEncodedCString enc False
140 -- | Determines whether a character can be accurately encoded in a 'CString'.
142 -- Pretty much anyone who uses this function is in a state of sin because
143 -- whether or not a character is encodable will, in general, depend on the
144 -- context in which it occurs.
145 charIsRepresentable :: TextEncoding -> Char -> IO Bool
146 charIsRepresentable enc c = withCString enc [c] (fmap (== [c]) . peekCString enc) `catchException` (\e -> let _ = e :: IOException in return False)
148 -- auxiliary definitions
149 -- ----------------------
151 -- C's end of string character
155 -- Size of a CChar in bytes
157 cCharSize = sizeOf (undefined :: CChar)
160 {-# INLINE peekEncodedCString #-}
161 peekEncodedCString :: TextEncoding -- ^ Encoding of CString
163 -> IO String -- ^ String in Haskell terms
164 peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
165 = bracket mk_decoder close $ \decoder -> do
166 let chunk_size = sz_bytes `max` 1 -- Decode buffer chunk size in characters: one iteration only for ASCII
167 from0 <- fmap (\fp -> bufferAdd sz_bytes (emptyBuffer fp sz_bytes ReadBuffer)) $ newForeignPtr_ (castPtr p)
168 to <- newCharBuffer chunk_size WriteBuffer
170 let go iteration from = do
171 (why, from', to') <- encode decoder from to
172 if isEmptyBuffer from'
174 -- No input remaining: @why@ will be InputUnderflow, but we don't care
175 fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
177 -- Input remaining: what went wrong?
178 putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
179 (from'', to'') <- case why of InvalidSequence -> recover decoder from' to' -- These conditions are equally bad because
180 InputUnderflow -> recover decoder from' to' -- they indicate malformed/truncated input
181 OutputUnderflow -> return (from', to') -- We will have more space next time round
182 putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
183 putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
184 to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
185 fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
189 {-# INLINE withEncodedCString #-}
190 withEncodedCString :: TextEncoding -- ^ Encoding of CString to create
191 -> Bool -- ^ Null-terminate?
192 -> String -- ^ String to encode
193 -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
195 withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
196 = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
197 from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
199 let go iteration to_sz_bytes = do
200 putDebugMsg ("withEncodedCString: " ++ show iteration)
201 allocaBytes to_sz_bytes $ \to_p -> do
202 mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes act
204 Nothing -> go (iteration + 1) (to_sz_bytes * 2)
205 Just res -> return res
207 -- If the input string is ASCII, this value will ensure we only allocate once
208 go (0 :: Int) (cCharSize * (sz + 1))
210 {-# INLINE newEncodedCString #-}
211 newEncodedCString :: TextEncoding -- ^ Encoding of CString to create
212 -> Bool -- ^ Null-terminate?
213 -> String -- ^ String to encode
215 newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
216 = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
217 from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
219 let go iteration to_p to_sz_bytes = do
220 putDebugMsg ("newEncodedCString: " ++ show iteration)
221 mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes return
224 let to_sz_bytes' = to_sz_bytes * 2
225 to_p' <- reallocBytes to_p to_sz_bytes'
226 go (iteration + 1) to_p' to_sz_bytes'
227 Just res -> return res
229 -- If the input string is ASCII, this value will ensure we only allocate once
230 let to_sz_bytes = cCharSize * (sz + 1)
231 to_p <- mallocBytes to_sz_bytes
232 go (0 :: Int) to_p to_sz_bytes
235 tryFillBufferAndCall :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int
236 -> (CStringLen -> IO a) -> IO (Maybe a)
237 tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do
238 to_fp <- newForeignPtr_ to_p
239 go (0 :: Int) (from0, emptyBuffer to_fp to_sz_bytes WriteBuffer)
241 go iteration (from, to) = do
242 (why, from', to') <- encode encoder from to
243 putDebugMsg ("tryFillBufferAndCall: " ++ show iteration ++ " " ++ show why ++ " " ++ summaryBuffer from ++ " " ++ summaryBuffer from')
244 if isEmptyBuffer from'
245 then if null_terminate && bufferAvailable to' == 0
246 then return Nothing -- We had enough for the string but not the terminator: ask the caller for more buffer
248 -- Awesome, we had enough buffer
249 let bytes = bufferElems to'
250 withBuffer to' $ \to_ptr -> do
251 when null_terminate $ pokeElemOff to_ptr (bufR to') 0
252 fmap Just $ act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes*
253 else case why of -- We didn't consume all of the input
254 InputUnderflow -> recover encoder from' to' >>= go (iteration + 1) -- These conditions are equally bad
255 InvalidSequence -> recover encoder from' to' >>= go (iteration + 1) -- since the input was truncated/invalid
256 OutputUnderflow -> return Nothing -- Oops, out of buffer during decoding: ask the caller for more