1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF16
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-16 Codecs for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF16 (
39 import GHC.IO.Exception
41 import GHC.IO.Encoding.Types
48 import System.Posix.Internals
53 puts :: String -> IO ()
54 puts s = do withCStringLen (s++"\n") $ \(p,len) ->
55 c_write 1 (castPtr p) (fromIntegral len)
59 -- -----------------------------------------------------------------------------
60 -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
63 utf16 = TextEncoding { mkTextDecoder = utf16_DF,
64 mkTextEncoder = utf16_EF }
66 utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
68 seen_bom <- newIORef Nothing
70 encode = utf16_decode seen_bom,
72 getState = readIORef seen_bom,
73 setState = writeIORef seen_bom
76 utf16_EF :: IO (TextEncoder Bool)
78 done_bom <- newIORef False
80 encode = utf16_encode done_bom,
82 getState = readIORef done_bom,
83 setState = writeIORef done_bom
86 utf16_encode :: IORef Bool -> EncodeBuffer
87 utf16_encode done_bom input
88 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
90 b <- readIORef done_bom
91 if b then utf16_native_encode input output
93 then return (input,output)
95 writeIORef done_bom True
96 writeWord8Buf oraw ow bom1
97 writeWord8Buf oraw (ow+1) bom2
98 utf16_native_encode input output{ bufR = ow+2 }
100 utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
101 utf16_decode seen_bom
102 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
105 mb <- readIORef seen_bom
107 Just decode -> decode input output
109 if iw - ir < 2 then return (input,output) else do
110 c0 <- readWord8Buf iraw ir
111 c1 <- readWord8Buf iraw (ir+1)
113 _ | c0 == bomB && c1 == bomL -> do
114 writeIORef seen_bom (Just utf16be_decode)
115 utf16be_decode input{ bufL= ir+2 } output
116 | c0 == bomL && c1 == bomB -> do
117 writeIORef seen_bom (Just utf16le_decode)
118 utf16le_decode input{ bufL= ir+2 } output
120 writeIORef seen_bom (Just utf16_native_decode)
121 utf16_native_decode input output
124 bomB, bomL, bom1, bom2 :: Word8
128 -- choose UTF-16BE by default for UTF-16 output
129 utf16_native_decode :: DecodeBuffer
130 utf16_native_decode = utf16be_decode
132 utf16_native_encode :: EncodeBuffer
133 utf16_native_encode = utf16be_encode
138 -- -----------------------------------------------------------------------------
139 -- UTF16LE and UTF16BE
141 utf16be :: TextEncoding
142 utf16be = TextEncoding { mkTextDecoder = utf16be_DF,
143 mkTextEncoder = utf16be_EF }
145 utf16be_DF :: IO (TextDecoder ())
147 return (BufferCodec {
148 encode = utf16be_decode,
150 getState = return (),
151 setState = const $ return ()
154 utf16be_EF :: IO (TextEncoder ())
156 return (BufferCodec {
157 encode = utf16be_encode,
159 getState = return (),
160 setState = const $ return ()
163 utf16le :: TextEncoding
164 utf16le = TextEncoding { mkTextDecoder = utf16le_DF,
165 mkTextEncoder = utf16le_EF }
167 utf16le_DF :: IO (TextDecoder ())
169 return (BufferCodec {
170 encode = utf16le_decode,
172 getState = return (),
173 setState = const $ return ()
176 utf16le_EF :: IO (TextEncoder ())
178 return (BufferCodec {
179 encode = utf16le_encode,
181 getState = return (),
182 setState = const $ return ()
186 utf16be_decode :: DecodeBuffer
188 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
189 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
192 | ow >= os || ir >= iw = done ir ow
193 | ir + 1 == iw = done ir ow
195 c0 <- readWord8Buf iraw ir
196 c1 <- readWord8Buf iraw (ir+1)
197 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
199 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
201 else if iw - ir < 4 then done ir ow else do
202 c2 <- readWord8Buf iraw (ir+2)
203 c3 <- readWord8Buf iraw (ir+3)
204 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
205 if not (validate2 x1 x2) then invalid else do
206 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
209 invalid = if ir > ir0 then done ir ow else ioe_decodingError
211 -- lambda-lifted, to avoid thunks being built in the inner-loop:
212 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
213 else input{ bufL=ir },
218 utf16le_decode :: DecodeBuffer
220 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
221 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
224 | ow >= os || ir >= iw = done ir ow
225 | ir + 1 == iw = done ir ow
227 c0 <- readWord8Buf iraw ir
228 c1 <- readWord8Buf iraw (ir+1)
229 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
231 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
233 else if iw - ir < 4 then done ir ow else do
234 c2 <- readWord8Buf iraw (ir+2)
235 c3 <- readWord8Buf iraw (ir+3)
236 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
237 if not (validate2 x1 x2) then invalid else do
238 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
241 invalid = if ir > ir0 then done ir ow else ioe_decodingError
243 -- lambda-lifted, to avoid thunks being built in the inner-loop:
244 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
245 else input{ bufL=ir },
250 ioe_decodingError :: IO a
251 ioe_decodingError = ioException
252 (IOError Nothing InvalidArgument "utf16_decode"
253 "invalid UTF-16 byte sequence" Nothing Nothing)
255 utf16be_encode :: EncodeBuffer
257 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
258 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
260 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
261 else input{ bufL=ir },
264 | ir >= iw = done ir ow
265 | os - ow < 2 = done ir ow
267 (c,ir') <- readCharBuf iraw ir
269 x | x < 0x10000 -> do
270 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
271 writeWord8Buf oraw (ow+1) (fromIntegral x)
274 if os - ow < 4 then done ir ow else do
277 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
278 c2 = fromIntegral (n1 `shiftR` 10)
280 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
283 writeWord8Buf oraw ow c1
284 writeWord8Buf oraw (ow+1) c2
285 writeWord8Buf oraw (ow+2) c3
286 writeWord8Buf oraw (ow+3) c4
291 utf16le_encode :: EncodeBuffer
293 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
294 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
296 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
297 else input{ bufL=ir },
300 | ir >= iw = done ir ow
301 | os - ow < 2 = done ir ow
303 (c,ir') <- readCharBuf iraw ir
305 x | x < 0x10000 -> do
306 writeWord8Buf oraw ow (fromIntegral x)
307 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
310 if os - ow < 4 then done ir ow else do
313 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
314 c2 = fromIntegral (n1 `shiftR` 10)
316 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
319 writeWord8Buf oraw ow c2
320 writeWord8Buf oraw (ow+1) c1
321 writeWord8Buf oraw (ow+2) c4
322 writeWord8Buf oraw (ow+3) c3
327 chr2 :: Word16 -> Word16 -> Char
328 chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
332 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
333 !lower# = y# -# 0xDC00#
336 validate1 :: Word16 -> Bool
337 validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
338 {-# INLINE validate1 #-}
340 validate2 :: Word16 -> Word16 -> Bool
341 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
342 x2 >= 0xDC00 && x2 <= 0xDFFF
343 {-# INLINE validate2 #-}