4 , NondecreasingIndentation
7 {-# OPTIONS_GHC -funbox-strict-fields #-}
9 -----------------------------------------------------------------------------
11 -- Module : GHC.IO.Encoding.UTF16
12 -- Copyright : (c) The University of Glasgow, 2009
13 -- License : see libraries/base/LICENSE
15 -- Maintainer : libraries@haskell.org
16 -- Stability : internal
17 -- Portability : non-portable
19 -- UTF-16 Codecs for the IO library
21 -- Portions Copyright : (c) Tom Harper 2008-2009,
22 -- (c) Bryan O'Sullivan 2009,
23 -- (c) Duncan Coutts 2009
25 -----------------------------------------------------------------------------
27 module GHC.IO.Encoding.UTF16 (
46 import GHC.IO.Encoding.Failure
47 import GHC.IO.Encoding.Types
53 -- -----------------------------------------------------------------------------
54 -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
57 utf16 = mkUTF16 ErrorOnCodingFailure
59 mkUTF16 :: CodingFailureMode -> TextEncoding
60 mkUTF16 cfm = TextEncoding { textEncodingName = "UTF-16",
61 mkTextDecoder = utf16_DF cfm,
62 mkTextEncoder = utf16_EF cfm }
64 utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
66 seen_bom <- newIORef Nothing
68 encode = utf16_decode seen_bom,
69 recover = recoverDecode cfm,
71 getState = readIORef seen_bom,
72 setState = writeIORef seen_bom
75 utf16_EF :: CodingFailureMode -> IO (TextEncoder Bool)
77 done_bom <- newIORef False
79 encode = utf16_encode done_bom,
80 recover = recoverEncode cfm,
82 getState = readIORef done_bom,
83 setState = writeIORef done_bom
86 utf16_encode :: IORef Bool -> EncodeBuffer
87 utf16_encode done_bom input
88 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
90 b <- readIORef done_bom
91 if b then utf16_native_encode input output
93 then return (OutputUnderflow,input,output)
95 writeIORef done_bom True
96 writeWord8Buf oraw ow bom1
97 writeWord8Buf oraw (ow+1) bom2
98 utf16_native_encode input output{ bufR = ow+2 }
100 utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
101 utf16_decode seen_bom
102 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
105 mb <- readIORef seen_bom
107 Just decode -> decode input output
109 if iw - ir < 2 then return (InputUnderflow,input,output) else do
110 c0 <- readWord8Buf iraw ir
111 c1 <- readWord8Buf iraw (ir+1)
113 _ | c0 == bomB && c1 == bomL -> do
114 writeIORef seen_bom (Just utf16be_decode)
115 utf16be_decode input{ bufL= ir+2 } output
116 | c0 == bomL && c1 == bomB -> do
117 writeIORef seen_bom (Just utf16le_decode)
118 utf16le_decode input{ bufL= ir+2 } output
120 writeIORef seen_bom (Just utf16_native_decode)
121 utf16_native_decode input output
124 bomB, bomL, bom1, bom2 :: Word8
128 -- choose UTF-16BE by default for UTF-16 output
129 utf16_native_decode :: DecodeBuffer
130 utf16_native_decode = utf16be_decode
132 utf16_native_encode :: EncodeBuffer
133 utf16_native_encode = utf16be_encode
138 -- -----------------------------------------------------------------------------
139 -- UTF16LE and UTF16BE
141 utf16be :: TextEncoding
142 utf16be = mkUTF16be ErrorOnCodingFailure
144 mkUTF16be :: CodingFailureMode -> TextEncoding
145 mkUTF16be cfm = TextEncoding { textEncodingName = "UTF-16BE",
146 mkTextDecoder = utf16be_DF cfm,
147 mkTextEncoder = utf16be_EF cfm }
149 utf16be_DF :: CodingFailureMode -> IO (TextDecoder ())
151 return (BufferCodec {
152 encode = utf16be_decode,
153 recover = recoverDecode cfm,
155 getState = return (),
156 setState = const $ return ()
159 utf16be_EF :: CodingFailureMode -> IO (TextEncoder ())
161 return (BufferCodec {
162 encode = utf16be_encode,
163 recover = recoverEncode cfm,
165 getState = return (),
166 setState = const $ return ()
169 utf16le :: TextEncoding
170 utf16le = mkUTF16le ErrorOnCodingFailure
172 mkUTF16le :: CodingFailureMode -> TextEncoding
173 mkUTF16le cfm = TextEncoding { textEncodingName = "UTF16-LE",
174 mkTextDecoder = utf16le_DF cfm,
175 mkTextEncoder = utf16le_EF cfm }
177 utf16le_DF :: CodingFailureMode -> IO (TextDecoder ())
179 return (BufferCodec {
180 encode = utf16le_decode,
181 recover = recoverDecode cfm,
183 getState = return (),
184 setState = const $ return ()
187 utf16le_EF :: CodingFailureMode -> IO (TextEncoder ())
189 return (BufferCodec {
190 encode = utf16le_encode,
191 recover = recoverEncode cfm,
193 getState = return (),
194 setState = const $ return ()
198 utf16be_decode :: DecodeBuffer
200 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
201 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
204 | ow >= os = done OutputUnderflow ir ow
205 | ir >= iw = done InputUnderflow ir ow
206 | ir + 1 == iw = done InputUnderflow ir ow
208 c0 <- readWord8Buf iraw ir
209 c1 <- readWord8Buf iraw (ir+1)
210 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
212 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
214 else if iw - ir < 4 then done InputUnderflow ir ow else do
215 c2 <- readWord8Buf iraw (ir+2)
216 c3 <- readWord8Buf iraw (ir+3)
217 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
218 if not (validate2 x1 x2) then invalid else do
219 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
222 invalid = done InvalidSequence ir ow
224 -- lambda-lifted, to avoid thunks being built in the inner-loop:
225 done why !ir !ow = return (why,
226 if ir == iw then input{ bufL=0, bufR=0 }
227 else input{ bufL=ir },
232 utf16le_decode :: DecodeBuffer
234 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
235 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
238 | ow >= os = done OutputUnderflow ir ow
239 | ir >= iw = done InputUnderflow ir ow
240 | ir + 1 == iw = done InputUnderflow ir ow
242 c0 <- readWord8Buf iraw ir
243 c1 <- readWord8Buf iraw (ir+1)
244 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
246 then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
248 else if iw - ir < 4 then done InputUnderflow ir ow else do
249 c2 <- readWord8Buf iraw (ir+2)
250 c3 <- readWord8Buf iraw (ir+3)
251 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
252 if not (validate2 x1 x2) then invalid else do
253 ow' <- writeCharBuf oraw ow (chr2 x1 x2)
256 invalid = done InvalidSequence ir ow
258 -- lambda-lifted, to avoid thunks being built in the inner-loop:
259 done why !ir !ow = return (why,
260 if ir == iw then input{ bufL=0, bufR=0 }
261 else input{ bufL=ir },
266 utf16be_encode :: EncodeBuffer
268 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
269 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
271 done why !ir !ow = return (why,
272 if ir == iw then input{ bufL=0, bufR=0 }
273 else input{ bufL=ir },
276 | ir >= iw = done InputUnderflow ir ow
277 | os - ow < 2 = done OutputUnderflow ir ow
279 (c,ir') <- readCharBuf iraw ir
281 x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
282 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
283 writeWord8Buf oraw (ow+1) (fromIntegral x)
286 if os - ow < 4 then done OutputUnderflow ir ow else do
289 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
290 c2 = fromIntegral (n1 `shiftR` 10)
292 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
295 writeWord8Buf oraw ow c1
296 writeWord8Buf oraw (ow+1) c2
297 writeWord8Buf oraw (ow+2) c3
298 writeWord8Buf oraw (ow+3) c4
303 utf16le_encode :: EncodeBuffer
305 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
306 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
308 done why !ir !ow = return (why,
309 if ir == iw then input{ bufL=0, bufR=0 }
310 else input{ bufL=ir },
313 | ir >= iw = done InputUnderflow ir ow
314 | os - ow < 2 = done OutputUnderflow ir ow
316 (c,ir') <- readCharBuf iraw ir
318 x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
319 writeWord8Buf oraw ow (fromIntegral x)
320 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
323 if os - ow < 4 then done OutputUnderflow ir ow else do
326 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
327 c2 = fromIntegral (n1 `shiftR` 10)
329 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
332 writeWord8Buf oraw ow c2
333 writeWord8Buf oraw (ow+1) c1
334 writeWord8Buf oraw (ow+2) c4
335 writeWord8Buf oraw (ow+3) c3
340 chr2 :: Word16 -> Word16 -> Char
341 chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
345 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
346 !lower# = y# -# 0xDC00#
349 validate1 :: Word16 -> Bool
350 validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
351 {-# INLINE validate1 #-}
353 validate2 :: Word16 -> Word16 -> Bool
354 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
355 x2 >= 0xDC00 && x2 <= 0xDFFF
356 {-# INLINE validate2 #-}