4 , NondecreasingIndentation
7 {-# OPTIONS_GHC -funbox-strict-fields #-}
9 -----------------------------------------------------------------------------
11 -- Module : GHC.IO.Encoding.UTF16
12 -- Copyright : (c) The University of Glasgow, 2009
13 -- License : see libraries/base/LICENSE
15 -- Maintainer : libraries@haskell.org
16 -- Stability : internal
17 -- Portability : non-portable
19 -- UTF-16 Codecs for the IO library
21 -- Portions Copyright : (c) Tom Harper 2008-2009,
22 -- (c) Bryan O'Sullivan 2009,
23 -- (c) Duncan Coutts 2009
25 -----------------------------------------------------------------------------
27 module GHC.IO.Encoding.UTF16 (
45 import GHC.IO.Exception
47 import GHC.IO.Encoding.Types
54 import System.Posix.Internals
59 puts :: String -> IO ()
60 -- In reality should be withCString, but assume ASCII to avoid possible loop
61 puts s = do withCAStringLen (s++"\n") $ \(p,len) ->
62 c_write 1 (castPtr p) (fromIntegral len)
66 -- -----------------------------------------------------------------------------
67 -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
70 utf16 = TextEncoding { textEncodingName = "UTF-16",
71 mkTextDecoder = utf16_DF,
72 mkTextEncoder = utf16_EF }
74 utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
76 seen_bom <- newIORef Nothing
78 encode = utf16_decode seen_bom,
80 getState = readIORef seen_bom,
81 setState = writeIORef seen_bom
84 utf16_EF :: IO (TextEncoder Bool)
86 done_bom <- newIORef False
88 encode = utf16_encode done_bom,
90 getState = readIORef done_bom,
91 setState = writeIORef done_bom
94 utf16_encode :: IORef Bool -> EncodeBuffer
95 utf16_encode done_bom input
96 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
98 b <- readIORef done_bom
99 if b then utf16_native_encode input output
101 then return (input,output)
103 writeIORef done_bom True
104 writeWord8Buf oraw ow bom1
105 writeWord8Buf oraw (ow+1) bom2
106 utf16_native_encode input output{ bufR = ow+2 }
108 utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
109 utf16_decode seen_bom
110 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
113 mb <- readIORef seen_bom
115 Just decode -> decode input output
117 if iw - ir < 2 then return (input,output) else do
118 c0 <- readWord8Buf iraw ir
119 c1 <- readWord8Buf iraw (ir+1)
121 _ | c0 == bomB && c1 == bomL -> do
122 writeIORef seen_bom (Just utf16be_decode)
123 utf16be_decode input{ bufL= ir+2 } output
124 | c0 == bomL && c1 == bomB -> do
125 writeIORef seen_bom (Just utf16le_decode)
126 utf16le_decode input{ bufL= ir+2 } output
128 writeIORef seen_bom (Just utf16_native_decode)
129 utf16_native_decode input output
132 bomB, bomL, bom1, bom2 :: Word8
136 -- choose UTF-16BE by default for UTF-16 output
137 utf16_native_decode :: DecodeBuffer
138 utf16_native_decode = utf16be_decode
140 utf16_native_encode :: EncodeBuffer
141 utf16_native_encode = utf16be_encode
146 -- -----------------------------------------------------------------------------
147 -- UTF16LE and UTF16BE
149 utf16be :: TextEncoding
150 utf16be = TextEncoding { textEncodingName = "UTF-16BE",
151 mkTextDecoder = utf16be_DF,
152 mkTextEncoder = utf16be_EF }
154 utf16be_DF :: IO (TextDecoder ())
156 return (BufferCodec {
157 encode = utf16be_decode,
159 getState = return (),
160 setState = const $ return ()
163 utf16be_EF :: IO (TextEncoder ())
165 return (BufferCodec {
166 encode = utf16be_encode,
168 getState = return (),
169 setState = const $ return ()
172 utf16le :: TextEncoding
173 utf16le = TextEncoding { textEncodingName = "UTF16-LE",
174 mkTextDecoder = utf16le_DF,
175 mkTextEncoder = utf16le_EF }
177 utf16le_DF :: IO (TextDecoder ())
179 return (BufferCodec {
180 encode = utf16le_decode,
182 getState = return (),
183 setState = const $ return ()
186 utf16le_EF :: IO (TextEncoder ())
188 return (BufferCodec {
189 encode = utf16le_encode,
191 getState = return (),
192 setState = const $ return ()
196 utf16be_decode :: DecodeBuffer
198 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
199 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
202 | ow >= os || ir >= iw = done ir ow
203 | ir + 1 == iw = done ir ow
205 c0 <- readWord8Buf iraw ir
206 c1 <- readWord8Buf iraw (ir+1)
207 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
209 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
211 else if iw - ir < 4 then done ir ow else do
212 c2 <- readWord8Buf iraw (ir+2)
213 c3 <- readWord8Buf iraw (ir+3)
214 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
215 if not (validate2 x1 x2) then invalid else do
216 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
219 invalid = if ir > ir0 then done ir ow else ioe_decodingError
221 -- lambda-lifted, to avoid thunks being built in the inner-loop:
222 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
223 else input{ bufL=ir },
228 utf16le_decode :: DecodeBuffer
230 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
231 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
234 | ow >= os || ir >= iw = done ir ow
235 | ir + 1 == iw = done ir ow
237 c0 <- readWord8Buf iraw ir
238 c1 <- readWord8Buf iraw (ir+1)
239 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
241 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
243 else if iw - ir < 4 then done ir ow else do
244 c2 <- readWord8Buf iraw (ir+2)
245 c3 <- readWord8Buf iraw (ir+3)
246 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
247 if not (validate2 x1 x2) then invalid else do
248 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
251 invalid = if ir > ir0 then done ir ow else ioe_decodingError
253 -- lambda-lifted, to avoid thunks being built in the inner-loop:
254 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
255 else input{ bufL=ir },
260 ioe_decodingError :: IO a
261 ioe_decodingError = ioException
262 (IOError Nothing InvalidArgument "utf16_decode"
263 "invalid UTF-16 byte sequence" Nothing Nothing)
265 utf16be_encode :: EncodeBuffer
267 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
268 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
270 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
271 else input{ bufL=ir },
274 | ir >= iw = done ir ow
275 | os - ow < 2 = done ir ow
277 (c,ir') <- readCharBuf iraw ir
279 x | x < 0x10000 -> do
280 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
281 writeWord8Buf oraw (ow+1) (fromIntegral x)
284 if os - ow < 4 then done ir ow else do
287 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
288 c2 = fromIntegral (n1 `shiftR` 10)
290 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
293 writeWord8Buf oraw ow c1
294 writeWord8Buf oraw (ow+1) c2
295 writeWord8Buf oraw (ow+2) c3
296 writeWord8Buf oraw (ow+3) c4
301 utf16le_encode :: EncodeBuffer
303 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
304 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
306 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
307 else input{ bufL=ir },
310 | ir >= iw = done ir ow
311 | os - ow < 2 = done ir ow
313 (c,ir') <- readCharBuf iraw ir
315 x | x < 0x10000 -> do
316 writeWord8Buf oraw ow (fromIntegral x)
317 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
320 if os - ow < 4 then done ir ow else do
323 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
324 c2 = fromIntegral (n1 `shiftR` 10)
326 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
329 writeWord8Buf oraw ow c2
330 writeWord8Buf oraw (ow+1) c1
331 writeWord8Buf oraw (ow+2) c4
332 writeWord8Buf oraw (ow+3) c3
337 chr2 :: Word16 -> Word16 -> Char
338 chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
342 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
343 !lower# = y# -# 0xDC00#
346 validate1 :: Word16 -> Bool
347 validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
348 {-# INLINE validate1 #-}
350 validate2 :: Word16 -> Word16 -> Bool
351 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
352 x2 >= 0xDC00 && x2 <= 0xDFFF
353 {-# INLINE validate2 #-}