4 , NondecreasingIndentation
7 {-# OPTIONS_GHC -funbox-strict-fields #-}
9 -----------------------------------------------------------------------------
11 -- Module : GHC.IO.Encoding.UTF16
12 -- Copyright : (c) The University of Glasgow, 2009
13 -- License : see libraries/base/LICENSE
15 -- Maintainer : libraries@haskell.org
16 -- Stability : internal
17 -- Portability : non-portable
19 -- UTF-16 Codecs for the IO library
21 -- Portions Copyright : (c) Tom Harper 2008-2009,
22 -- (c) Bryan O'Sullivan 2009,
23 -- (c) Duncan Coutts 2009
25 -----------------------------------------------------------------------------
27 module GHC.IO.Encoding.UTF16 (
45 import GHC.IO.Exception
47 import GHC.IO.Encoding.Types
54 import System.Posix.Internals
59 puts :: String -> IO ()
60 puts s = do withCStringLen (s++"\n") $ \(p,len) ->
61 c_write 1 (castPtr p) (fromIntegral len)
65 -- -----------------------------------------------------------------------------
66 -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
69 utf16 = TextEncoding { textEncodingName = "UTF-16",
70 mkTextDecoder = utf16_DF,
71 mkTextEncoder = utf16_EF }
73 utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
75 seen_bom <- newIORef Nothing
77 encode = utf16_decode seen_bom,
79 getState = readIORef seen_bom,
80 setState = writeIORef seen_bom
83 utf16_EF :: IO (TextEncoder Bool)
85 done_bom <- newIORef False
87 encode = utf16_encode done_bom,
89 getState = readIORef done_bom,
90 setState = writeIORef done_bom
93 utf16_encode :: IORef Bool -> EncodeBuffer
94 utf16_encode done_bom input
95 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
97 b <- readIORef done_bom
98 if b then utf16_native_encode input output
100 then return (input,output)
102 writeIORef done_bom True
103 writeWord8Buf oraw ow bom1
104 writeWord8Buf oraw (ow+1) bom2
105 utf16_native_encode input output{ bufR = ow+2 }
107 utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
108 utf16_decode seen_bom
109 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
112 mb <- readIORef seen_bom
114 Just decode -> decode input output
116 if iw - ir < 2 then return (input,output) else do
117 c0 <- readWord8Buf iraw ir
118 c1 <- readWord8Buf iraw (ir+1)
120 _ | c0 == bomB && c1 == bomL -> do
121 writeIORef seen_bom (Just utf16be_decode)
122 utf16be_decode input{ bufL= ir+2 } output
123 | c0 == bomL && c1 == bomB -> do
124 writeIORef seen_bom (Just utf16le_decode)
125 utf16le_decode input{ bufL= ir+2 } output
127 writeIORef seen_bom (Just utf16_native_decode)
128 utf16_native_decode input output
131 bomB, bomL, bom1, bom2 :: Word8
135 -- choose UTF-16BE by default for UTF-16 output
136 utf16_native_decode :: DecodeBuffer
137 utf16_native_decode = utf16be_decode
139 utf16_native_encode :: EncodeBuffer
140 utf16_native_encode = utf16be_encode
145 -- -----------------------------------------------------------------------------
146 -- UTF16LE and UTF16BE
148 utf16be :: TextEncoding
149 utf16be = TextEncoding { textEncodingName = "UTF-16BE",
150 mkTextDecoder = utf16be_DF,
151 mkTextEncoder = utf16be_EF }
153 utf16be_DF :: IO (TextDecoder ())
155 return (BufferCodec {
156 encode = utf16be_decode,
158 getState = return (),
159 setState = const $ return ()
162 utf16be_EF :: IO (TextEncoder ())
164 return (BufferCodec {
165 encode = utf16be_encode,
167 getState = return (),
168 setState = const $ return ()
171 utf16le :: TextEncoding
172 utf16le = TextEncoding { textEncodingName = "UTF16-LE",
173 mkTextDecoder = utf16le_DF,
174 mkTextEncoder = utf16le_EF }
176 utf16le_DF :: IO (TextDecoder ())
178 return (BufferCodec {
179 encode = utf16le_decode,
181 getState = return (),
182 setState = const $ return ()
185 utf16le_EF :: IO (TextEncoder ())
187 return (BufferCodec {
188 encode = utf16le_encode,
190 getState = return (),
191 setState = const $ return ()
195 utf16be_decode :: DecodeBuffer
197 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
198 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
201 | ow >= os || ir >= iw = done ir ow
202 | ir + 1 == iw = done ir ow
204 c0 <- readWord8Buf iraw ir
205 c1 <- readWord8Buf iraw (ir+1)
206 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
208 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
210 else if iw - ir < 4 then done ir ow else do
211 c2 <- readWord8Buf iraw (ir+2)
212 c3 <- readWord8Buf iraw (ir+3)
213 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
214 if not (validate2 x1 x2) then invalid else do
215 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
218 invalid = if ir > ir0 then done ir ow else ioe_decodingError
220 -- lambda-lifted, to avoid thunks being built in the inner-loop:
221 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
222 else input{ bufL=ir },
227 utf16le_decode :: DecodeBuffer
229 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
230 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
233 | ow >= os || ir >= iw = done ir ow
234 | ir + 1 == iw = done ir ow
236 c0 <- readWord8Buf iraw ir
237 c1 <- readWord8Buf iraw (ir+1)
238 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
240 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
242 else if iw - ir < 4 then done ir ow else do
243 c2 <- readWord8Buf iraw (ir+2)
244 c3 <- readWord8Buf iraw (ir+3)
245 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
246 if not (validate2 x1 x2) then invalid else do
247 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
250 invalid = if ir > ir0 then done ir ow else ioe_decodingError
252 -- lambda-lifted, to avoid thunks being built in the inner-loop:
253 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
254 else input{ bufL=ir },
259 ioe_decodingError :: IO a
260 ioe_decodingError = ioException
261 (IOError Nothing InvalidArgument "utf16_decode"
262 "invalid UTF-16 byte sequence" Nothing Nothing)
264 utf16be_encode :: EncodeBuffer
266 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
267 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
269 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
270 else input{ bufL=ir },
273 | ir >= iw = done ir ow
274 | os - ow < 2 = done ir ow
276 (c,ir') <- readCharBuf iraw ir
278 x | x < 0x10000 -> do
279 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
280 writeWord8Buf oraw (ow+1) (fromIntegral x)
283 if os - ow < 4 then done ir ow else do
286 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
287 c2 = fromIntegral (n1 `shiftR` 10)
289 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
292 writeWord8Buf oraw ow c1
293 writeWord8Buf oraw (ow+1) c2
294 writeWord8Buf oraw (ow+2) c3
295 writeWord8Buf oraw (ow+3) c4
300 utf16le_encode :: EncodeBuffer
302 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
303 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
305 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
306 else input{ bufL=ir },
309 | ir >= iw = done ir ow
310 | os - ow < 2 = done ir ow
312 (c,ir') <- readCharBuf iraw ir
314 x | x < 0x10000 -> do
315 writeWord8Buf oraw ow (fromIntegral x)
316 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
319 if os - ow < 4 then done ir ow else do
322 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
323 c2 = fromIntegral (n1 `shiftR` 10)
325 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
328 writeWord8Buf oraw ow c2
329 writeWord8Buf oraw (ow+1) c1
330 writeWord8Buf oraw (ow+2) c4
331 writeWord8Buf oraw (ow+3) c3
336 chr2 :: Word16 -> Word16 -> Char
337 chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
341 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
342 !lower# = y# -# 0xDC00#
345 validate1 :: Word16 -> Bool
346 validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
347 {-# INLINE validate1 #-}
349 validate2 :: Word16 -> Word16 -> Bool
350 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
351 x2 >= 0xDC00 && x2 <= 0xDFFF
352 {-# INLINE validate2 #-}