3 , ForeignFunctionInterface
4 , NondecreasingIndentation
7 -----------------------------------------------------------------------------
9 -- Module : GHC.IO.Encoding.Iconv
10 -- Copyright : (c) The University of Glasgow, 2008-2009
11 -- License : see libraries/base/LICENSE
13 -- Maintainer : libraries@haskell.org
14 -- Stability : internal
15 -- Portability : non-portable
17 -- This module provides text encoding/decoding using iconv
19 -----------------------------------------------------------------------------
22 module GHC.IO.Encoding.Iconv (
23 #if !defined(mingw32_HOST_OS)
27 utf16, utf16le, utf16be,
28 utf32, utf32le, utf32be,
34 #include "HsBaseConfig.h"
36 #if !defined(mingw32_HOST_OS)
38 import Foreign hiding (unsafePerformIO)
43 import GHC.IO.Encoding.Types
44 import GHC.List (span)
48 import System.IO.Unsafe (unsafePerformIO)
49 import System.Posix.Internals
54 iconv_trace :: String -> IO ()
56 | c_DEBUG_DUMP = puts s
57 | otherwise = return ()
59 puts :: String -> IO ()
60 puts s = do _ <- withCAStringLen (s ++ "\n") $ \(p, len) ->
61 -- In reality should be withCString, but assume ASCII to avoid loop
62 c_write 1 (castPtr p) (fromIntegral len)
65 -- -----------------------------------------------------------------------------
66 -- iconv encoders/decoders
68 {-# NOINLINE latin1 #-}
69 latin1 :: TextEncoding
70 latin1 = unsafePerformIO (mkTextEncoding "Latin1")
74 utf8 = unsafePerformIO (mkTextEncoding "UTF8")
76 {-# NOINLINE utf16 #-}
78 utf16 = unsafePerformIO (mkTextEncoding "UTF16")
80 {-# NOINLINE utf16le #-}
81 utf16le :: TextEncoding
82 utf16le = unsafePerformIO (mkTextEncoding "UTF16LE")
84 {-# NOINLINE utf16be #-}
85 utf16be :: TextEncoding
86 utf16be = unsafePerformIO (mkTextEncoding "UTF16BE")
88 {-# NOINLINE utf32 #-}
90 utf32 = unsafePerformIO (mkTextEncoding "UTF32")
92 {-# NOINLINE utf32le #-}
93 utf32le :: TextEncoding
94 utf32le = unsafePerformIO (mkTextEncoding "UTF32LE")
96 {-# NOINLINE utf32be #-}
97 utf32be :: TextEncoding
98 utf32be = unsafePerformIO (mkTextEncoding "UTF32BE")
100 {-# NOINLINE localeEncodingName #-}
101 localeEncodingName :: String
102 localeEncodingName = unsafePerformIO $ do
103 -- Use locale_charset() or nl_langinfo(CODESET) to get the encoding
104 -- if we have either of them.
105 cstr <- c_localeEncoding
106 peekCAString cstr -- Assume charset names are ASCII
108 {-# NOINLINE localeEncoding #-}
109 localeEncoding :: TextEncoding
110 localeEncoding = unsafePerformIO $ mkTextEncoding localeEncodingName
112 -- We hope iconv_t is a storable type. It should be, since it has at least the
113 -- value -1, which is a possible return value from iconv_open.
114 type IConv = CLong -- ToDo: (#type iconv_t)
116 foreign import ccall unsafe "hs_iconv_open"
117 hs_iconv_open :: CString -> CString -> IO IConv
119 foreign import ccall unsafe "hs_iconv_close"
120 hs_iconv_close :: IConv -> IO CInt
122 foreign import ccall unsafe "hs_iconv"
123 hs_iconv :: IConv -> Ptr CString -> Ptr CSize -> Ptr CString -> Ptr CSize
126 foreign import ccall unsafe "localeEncoding"
127 c_localeEncoding :: IO CString
129 haskellChar :: String
130 #ifdef WORDS_BIGENDIAN
131 haskellChar | charSize == 2 = "UTF-16BE"
132 | otherwise = "UTF-32BE"
134 haskellChar | charSize == 2 = "UTF-16LE"
135 | otherwise = "UTF-32LE"
139 char_shift | charSize == 2 = 1
142 mkTextEncoding :: String -> IO TextEncoding
143 mkTextEncoding charset = do
144 return (TextEncoding {
145 textEncodingName = charset,
146 mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) iconvDecode,
147 mkTextEncoder = newIConv haskellChar charset iconvEncode})
149 -- An annoying feature of GNU iconv is that the //PREFIXES only take
150 -- effect when they appear on the tocode parameter to iconv_open:
151 (raw_charset, suffix) = span (/= '/') charset
153 newIConv :: String -> String
154 -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
155 -> IO (BufferCodec a b ())
156 newIConv from to fn =
157 -- Assume charset names are ASCII
158 withCAString from $ \ from_str ->
159 withCAString to $ \ to_str -> do
160 iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str
161 let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
165 -- iconv doesn't supply a way to save/restore the state
166 getState = return (),
167 setState = const $ return ()
170 iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
171 -> IO (Buffer Word8, Buffer CharBufElem)
172 iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
174 iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8
175 -> IO (Buffer CharBufElem, Buffer Word8)
176 iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
178 iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
179 -> IO (Buffer a, Buffer b)
181 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
182 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
184 iconv_trace ("haskelChar=" ++ show haskellChar)
185 iconv_trace ("iconvRecode before, input=" ++ show (summaryBuffer input))
186 iconv_trace ("iconvRecode before, output=" ++ show (summaryBuffer output))
187 withRawBuffer iraw $ \ piraw -> do
188 withRawBuffer oraw $ \ poraw -> do
189 with (piraw `plusPtr` (ir `shiftL` iscale)) $ \ p_inbuf -> do
190 with (poraw `plusPtr` (ow `shiftL` oscale)) $ \ p_outbuf -> do
191 with (fromIntegral ((iw-ir) `shiftL` iscale)) $ \ p_inleft -> do
192 with (fromIntegral ((os-ow) `shiftL` oscale)) $ \ p_outleft -> do
193 res <- hs_iconv iconv_t p_inbuf p_inleft p_outbuf p_outleft
194 new_inleft <- peek p_inleft
195 new_outleft <- peek p_outleft
197 new_inleft' = fromIntegral new_inleft `shiftR` iscale
198 new_outleft' = fromIntegral new_outleft `shiftR` oscale
200 | new_inleft == 0 = input { bufL = 0, bufR = 0 }
201 | otherwise = input { bufL = iw - new_inleft' }
202 new_output = output{ bufR = os - new_outleft' }
203 iconv_trace ("iconv res=" ++ show res)
204 iconv_trace ("iconvRecode after, input=" ++ show (summaryBuffer new_input))
205 iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
207 then do -- all input translated
208 return (new_input, new_output)
212 e | e == eINVAL || e == e2BIG
213 || e == eILSEQ && new_inleft' /= (iw-ir) -> do
214 iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
215 -- Output overflow is harmless
217 -- Similarly, we ignore EILSEQ unless we converted no
218 -- characters. Sometimes iconv reports EILSEQ for a
219 -- character in the input even when there is no room
220 -- in the output; in this case we might be about to
221 -- change the encoding anyway, so the following bytes
222 -- could very well be in a different encoding.
223 -- This also helps with pinpointing EILSEQ errors: we
224 -- don't report it until the rest of the characters in
225 -- the buffer have been drained.
226 return (new_input, new_output)
229 iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
230 throwErrno "iconvRecoder"
231 -- illegal sequence, or some other error
233 #endif /* !mingw32_HOST_OS */