1 {-# OPTIONS_GHC -XNoImplicitPrelude #-}
2 -----------------------------------------------------------------------------
4 -- Module : GHC.IO.Encoding.Iconv
5 -- Copyright : (c) The University of Glasgow, 2008-2009
6 -- License : see libraries/base/LICENSE
8 -- Maintainer : libraries@haskell.org
9 -- Stability : internal
10 -- Portability : non-portable
12 -- This module provides text encoding/decoding using iconv
14 -----------------------------------------------------------------------------
17 module GHC.IO.Encoding.Iconv (
18 #if !defined(mingw32_HOST_OS)
22 utf16, utf16le, utf16be,
23 utf32, utf32le, utf32be,
29 #include "HsBaseConfig.h"
31 #if !defined(mingw32_HOST_OS)
33 import Foreign hiding (unsafePerformIO)
38 import GHC.IO.Encoding.Types
42 import System.IO.Unsafe (unsafePerformIO)
43 import System.Posix.Internals
48 iconv_trace :: String -> IO ()
50 | c_DEBUG_DUMP = puts s
51 | otherwise = return ()
53 puts :: String -> IO ()
54 puts s = do _ <- withCStringLen (s ++ "\n") $ \(p, len) ->
55 c_write 1 (castPtr p) (fromIntegral len)
58 -- -----------------------------------------------------------------------------
59 -- iconv encoders/decoders
61 {-# NOINLINE latin1 #-}
62 latin1 :: TextEncoding
63 latin1 = unsafePerformIO (mkTextEncoding "Latin1")
67 utf8 = unsafePerformIO (mkTextEncoding "UTF8")
69 {-# NOINLINE utf16 #-}
71 utf16 = unsafePerformIO (mkTextEncoding "UTF16")
73 {-# NOINLINE utf16le #-}
74 utf16le :: TextEncoding
75 utf16le = unsafePerformIO (mkTextEncoding "UTF16LE")
77 {-# NOINLINE utf16be #-}
78 utf16be :: TextEncoding
79 utf16be = unsafePerformIO (mkTextEncoding "UTF16BE")
81 {-# NOINLINE utf32 #-}
83 utf32 = unsafePerformIO (mkTextEncoding "UTF32")
85 {-# NOINLINE utf32le #-}
86 utf32le :: TextEncoding
87 utf32le = unsafePerformIO (mkTextEncoding "UTF32LE")
89 {-# NOINLINE utf32be #-}
90 utf32be :: TextEncoding
91 utf32be = unsafePerformIO (mkTextEncoding "UTF32BE")
93 {-# NOINLINE localeEncoding #-}
94 localeEncoding :: TextEncoding
95 localeEncoding = unsafePerformIO $ do
96 -- Use locale_charset() or nl_langinfo(CODESET) to get the encoding
97 -- if we have either of them.
98 cstr <- c_localeEncoding
102 -- We hope iconv_t is a storable type. It should be, since it has at least the
103 -- value -1, which is a possible return value from iconv_open.
104 type IConv = CLong -- ToDo: (#type iconv_t)
106 foreign import ccall unsafe "hs_iconv_open"
107 hs_iconv_open :: CString -> CString -> IO IConv
109 foreign import ccall unsafe "hs_iconv_close"
110 hs_iconv_close :: IConv -> IO CInt
112 foreign import ccall unsafe "hs_iconv"
113 hs_iconv :: IConv -> Ptr CString -> Ptr CSize -> Ptr CString -> Ptr CSize
116 foreign import ccall unsafe "localeEncoding"
117 c_localeEncoding :: IO CString
119 haskellChar :: String
120 #ifdef WORDS_BIGENDIAN
121 haskellChar | charSize == 2 = "UTF-16BE"
122 | otherwise = "UTF-32BE"
124 haskellChar | charSize == 2 = "UTF-16LE"
125 | otherwise = "UTF-32LE"
129 char_shift | charSize == 2 = 1
132 mkTextEncoding :: String -> IO TextEncoding
133 mkTextEncoding charset = do
134 return (TextEncoding {
135 textEncodingName = charset,
136 mkTextDecoder = newIConv charset haskellChar iconvDecode,
137 mkTextEncoder = newIConv haskellChar charset iconvEncode})
139 newIConv :: String -> String
140 -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
141 -> IO (BufferCodec a b ())
142 newIConv from to fn =
143 withCString from $ \ from_str ->
144 withCString to $ \ to_str -> do
145 iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str
146 let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
150 -- iconv doesn't supply a way to save/restore the state
151 getState = return (),
152 setState = const $ return ()
155 iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
156 -> IO (Buffer Word8, Buffer CharBufElem)
157 iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
159 iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8
160 -> IO (Buffer CharBufElem, Buffer Word8)
161 iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
163 iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
164 -> IO (Buffer a, Buffer b)
166 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
167 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
169 iconv_trace ("haskelChar=" ++ show haskellChar)
170 iconv_trace ("iconvRecode before, input=" ++ show (summaryBuffer input))
171 iconv_trace ("iconvRecode before, output=" ++ show (summaryBuffer output))
172 withRawBuffer iraw $ \ piraw -> do
173 withRawBuffer oraw $ \ poraw -> do
174 with (piraw `plusPtr` (ir `shiftL` iscale)) $ \ p_inbuf -> do
175 with (poraw `plusPtr` (ow `shiftL` oscale)) $ \ p_outbuf -> do
176 with (fromIntegral ((iw-ir) `shiftL` iscale)) $ \ p_inleft -> do
177 with (fromIntegral ((os-ow) `shiftL` oscale)) $ \ p_outleft -> do
178 res <- hs_iconv iconv_t p_inbuf p_inleft p_outbuf p_outleft
179 new_inleft <- peek p_inleft
180 new_outleft <- peek p_outleft
182 new_inleft' = fromIntegral new_inleft `shiftR` iscale
183 new_outleft' = fromIntegral new_outleft `shiftR` oscale
185 | new_inleft == 0 = input { bufL = 0, bufR = 0 }
186 | otherwise = input { bufL = iw - new_inleft' }
187 new_output = output{ bufR = os - new_outleft' }
188 iconv_trace ("iconv res=" ++ show res)
189 iconv_trace ("iconvRecode after, input=" ++ show (summaryBuffer new_input))
190 iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
192 then do -- all input translated
193 return (new_input, new_output)
198 || (e == e2BIG || e == eILSEQ) && new_inleft' /= (iw-ir) -> do
199 iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
200 -- Output overflow is relatively harmless, unless
201 -- we made no progress at all.
203 -- Similarly, we ignore EILSEQ unless we converted no
204 -- characters. Sometimes iconv reports EILSEQ for a
205 -- character in the input even when there is no room
206 -- in the output; in this case we might be about to
207 -- change the encoding anyway, so the following bytes
208 -- could very well be in a different encoding.
209 -- This also helps with pinpointing EILSEQ errors: we
210 -- don't report it until the rest of the characters in
211 -- the buffer have been drained.
212 return (new_input, new_output)
215 throwErrno "iconvRecoder"
216 -- illegal sequence, or some other error
218 #endif /* !mingw32_HOST_OS */