1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF16
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-16 Codecs for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF16 (
39 import GHC.IO.Exception
41 import GHC.IO.Encoding.Types
48 import System.Posix.Internals
53 puts :: String -> IO ()
54 puts s = do withCStringLen (s++"\n") $ \(p,len) ->
55 c_write 1 (castPtr p) (fromIntegral len)
59 -- -----------------------------------------------------------------------------
60 -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
63 utf16 = TextEncoding { textEncodingName = "UTF-16",
64 mkTextDecoder = utf16_DF,
65 mkTextEncoder = utf16_EF }
67 utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
69 seen_bom <- newIORef Nothing
71 encode = utf16_decode seen_bom,
73 getState = readIORef seen_bom,
74 setState = writeIORef seen_bom
77 utf16_EF :: IO (TextEncoder Bool)
79 done_bom <- newIORef False
81 encode = utf16_encode done_bom,
83 getState = readIORef done_bom,
84 setState = writeIORef done_bom
87 utf16_encode :: IORef Bool -> EncodeBuffer
88 utf16_encode done_bom input
89 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
91 b <- readIORef done_bom
92 if b then utf16_native_encode input output
94 then return (input,output)
96 writeIORef done_bom True
97 writeWord8Buf oraw ow bom1
98 writeWord8Buf oraw (ow+1) bom2
99 utf16_native_encode input output{ bufR = ow+2 }
101 utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
102 utf16_decode seen_bom
103 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
106 mb <- readIORef seen_bom
108 Just decode -> decode input output
110 if iw - ir < 2 then return (input,output) else do
111 c0 <- readWord8Buf iraw ir
112 c1 <- readWord8Buf iraw (ir+1)
114 _ | c0 == bomB && c1 == bomL -> do
115 writeIORef seen_bom (Just utf16be_decode)
116 utf16be_decode input{ bufL= ir+2 } output
117 | c0 == bomL && c1 == bomB -> do
118 writeIORef seen_bom (Just utf16le_decode)
119 utf16le_decode input{ bufL= ir+2 } output
121 writeIORef seen_bom (Just utf16_native_decode)
122 utf16_native_decode input output
125 bomB, bomL, bom1, bom2 :: Word8
129 -- choose UTF-16BE by default for UTF-16 output
130 utf16_native_decode :: DecodeBuffer
131 utf16_native_decode = utf16be_decode
133 utf16_native_encode :: EncodeBuffer
134 utf16_native_encode = utf16be_encode
139 -- -----------------------------------------------------------------------------
140 -- UTF16LE and UTF16BE
142 utf16be :: TextEncoding
143 utf16be = TextEncoding { textEncodingName = "UTF-16BE",
144 mkTextDecoder = utf16be_DF,
145 mkTextEncoder = utf16be_EF }
147 utf16be_DF :: IO (TextDecoder ())
149 return (BufferCodec {
150 encode = utf16be_decode,
152 getState = return (),
153 setState = const $ return ()
156 utf16be_EF :: IO (TextEncoder ())
158 return (BufferCodec {
159 encode = utf16be_encode,
161 getState = return (),
162 setState = const $ return ()
165 utf16le :: TextEncoding
166 utf16le = TextEncoding { textEncodingName = "UTF16-LE",
167 mkTextDecoder = utf16le_DF,
168 mkTextEncoder = utf16le_EF }
170 utf16le_DF :: IO (TextDecoder ())
172 return (BufferCodec {
173 encode = utf16le_decode,
175 getState = return (),
176 setState = const $ return ()
179 utf16le_EF :: IO (TextEncoder ())
181 return (BufferCodec {
182 encode = utf16le_encode,
184 getState = return (),
185 setState = const $ return ()
189 utf16be_decode :: DecodeBuffer
191 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
192 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
195 | ow >= os || ir >= iw = done ir ow
196 | ir + 1 == iw = done ir ow
198 c0 <- readWord8Buf iraw ir
199 c1 <- readWord8Buf iraw (ir+1)
200 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
202 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
204 else if iw - ir < 4 then done ir ow else do
205 c2 <- readWord8Buf iraw (ir+2)
206 c3 <- readWord8Buf iraw (ir+3)
207 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
208 if not (validate2 x1 x2) then invalid else do
209 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
212 invalid = if ir > ir0 then done ir ow else ioe_decodingError
214 -- lambda-lifted, to avoid thunks being built in the inner-loop:
215 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
216 else input{ bufL=ir },
221 utf16le_decode :: DecodeBuffer
223 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
224 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
227 | ow >= os || ir >= iw = done ir ow
228 | ir + 1 == iw = done ir ow
230 c0 <- readWord8Buf iraw ir
231 c1 <- readWord8Buf iraw (ir+1)
232 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
234 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
236 else if iw - ir < 4 then done ir ow else do
237 c2 <- readWord8Buf iraw (ir+2)
238 c3 <- readWord8Buf iraw (ir+3)
239 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
240 if not (validate2 x1 x2) then invalid else do
241 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
244 invalid = if ir > ir0 then done ir ow else ioe_decodingError
246 -- lambda-lifted, to avoid thunks being built in the inner-loop:
247 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
248 else input{ bufL=ir },
253 ioe_decodingError :: IO a
254 ioe_decodingError = ioException
255 (IOError Nothing InvalidArgument "utf16_decode"
256 "invalid UTF-16 byte sequence" Nothing Nothing)
258 utf16be_encode :: EncodeBuffer
260 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
261 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
263 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
264 else input{ bufL=ir },
267 | ir >= iw = done ir ow
268 | os - ow < 2 = done ir ow
270 (c,ir') <- readCharBuf iraw ir
272 x | x < 0x10000 -> do
273 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
274 writeWord8Buf oraw (ow+1) (fromIntegral x)
277 if os - ow < 4 then done ir ow else do
280 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
281 c2 = fromIntegral (n1 `shiftR` 10)
283 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
286 writeWord8Buf oraw ow c1
287 writeWord8Buf oraw (ow+1) c2
288 writeWord8Buf oraw (ow+2) c3
289 writeWord8Buf oraw (ow+3) c4
294 utf16le_encode :: EncodeBuffer
296 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
297 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
299 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
300 else input{ bufL=ir },
303 | ir >= iw = done ir ow
304 | os - ow < 2 = done ir ow
306 (c,ir') <- readCharBuf iraw ir
308 x | x < 0x10000 -> do
309 writeWord8Buf oraw ow (fromIntegral x)
310 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
313 if os - ow < 4 then done ir ow else do
316 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
317 c2 = fromIntegral (n1 `shiftR` 10)
319 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
322 writeWord8Buf oraw ow c2
323 writeWord8Buf oraw (ow+1) c1
324 writeWord8Buf oraw (ow+2) c4
325 writeWord8Buf oraw (ow+3) c3
330 chr2 :: Word16 -> Word16 -> Char
331 chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
335 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
336 !lower# = y# -# 0xDC00#
339 validate1 :: Word16 -> Bool
340 validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
341 {-# INLINE validate1 #-}
343 validate2 :: Word16 -> Word16 -> Bool
344 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
345 x2 >= 0xDC00 && x2 <= 0xDFFF
346 {-# INLINE validate2 #-}