1 {-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF16
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-16 Codecs for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF16 (
39 import GHC.IO.Exception
41 import GHC.IO.Encoding.Types
48 import System.Posix.Internals
52 puts :: String -> IO ()
53 puts s = do withCStringLen (s++"\n") $ \(p,len) ->
54 c_write 1 p (fromIntegral len)
58 -- -----------------------------------------------------------------------------
59 -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
62 utf16 = TextEncoding { mkTextDecoder = utf16_DF,
63 mkTextEncoder = utf16_EF }
65 utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
67 seen_bom <- newIORef Nothing
69 encode = utf16_decode seen_bom,
71 getState = readIORef seen_bom,
72 setState = writeIORef seen_bom
75 utf16_EF :: IO (TextEncoder Bool)
77 done_bom <- newIORef False
79 encode = utf16_encode done_bom,
81 getState = readIORef done_bom,
82 setState = writeIORef done_bom
85 utf16_encode :: IORef Bool -> EncodeBuffer
86 utf16_encode done_bom input
87 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
89 b <- readIORef done_bom
90 if b then utf16_native_encode input output
92 then return (input,output)
94 writeIORef done_bom True
95 writeWord8Buf oraw ow bom1
96 writeWord8Buf oraw (ow+1) bom2
97 utf16_native_encode input output{ bufR = ow+2 }
99 utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
100 utf16_decode seen_bom
101 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
104 mb <- readIORef seen_bom
106 Just decode -> decode input output
108 if iw - ir < 2 then return (input,output) else do
109 c0 <- readWord8Buf iraw ir
110 c1 <- readWord8Buf iraw (ir+1)
112 _ | c0 == bomB && c1 == bomL -> do
113 writeIORef seen_bom (Just utf16be_decode)
114 utf16be_decode input{ bufL= ir+2 } output
115 | c0 == bomL && c1 == bomB -> do
116 writeIORef seen_bom (Just utf16le_decode)
117 utf16le_decode input{ bufL= ir+2 } output
119 writeIORef seen_bom (Just utf16_native_decode)
120 utf16_native_decode input output
123 bomB, bomL, bom1, bom2 :: Word8
127 -- choose UTF-16BE by default for UTF-16 output
128 utf16_native_decode :: DecodeBuffer
129 utf16_native_decode = utf16be_decode
131 utf16_native_encode :: EncodeBuffer
132 utf16_native_encode = utf16be_encode
137 -- -----------------------------------------------------------------------------
138 -- UTF16LE and UTF16BE
140 utf16be :: TextEncoding
141 utf16be = TextEncoding { mkTextDecoder = utf16be_DF,
142 mkTextEncoder = utf16be_EF }
144 utf16be_DF :: IO (TextDecoder ())
146 return (BufferCodec {
147 encode = utf16be_decode,
149 getState = return (),
150 setState = const $ return ()
153 utf16be_EF :: IO (TextEncoder ())
155 return (BufferCodec {
156 encode = utf16be_encode,
158 getState = return (),
159 setState = const $ return ()
162 utf16le :: TextEncoding
163 utf16le = TextEncoding { mkTextDecoder = utf16le_DF,
164 mkTextEncoder = utf16le_EF }
166 utf16le_DF :: IO (TextDecoder ())
168 return (BufferCodec {
169 encode = utf16le_decode,
171 getState = return (),
172 setState = const $ return ()
175 utf16le_EF :: IO (TextEncoder ())
177 return (BufferCodec {
178 encode = utf16le_encode,
180 getState = return (),
181 setState = const $ return ()
185 utf16be_decode :: DecodeBuffer
187 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
188 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
191 | ow >= os || ir >= iw = done ir ow
192 | ir + 1 == iw = done ir ow
194 c0 <- readWord8Buf iraw ir
195 c1 <- readWord8Buf iraw (ir+1)
196 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1
198 then do writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
200 else if iw - ir < 4 then done ir ow else do
201 c2 <- readWord8Buf iraw (ir+2)
202 c3 <- readWord8Buf iraw (ir+3)
203 let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
204 if not (validate2 x1 x2) then invalid else do
205 writeCharBuf oraw ow (chr2 x1 x2)
208 invalid = if ir > ir0 then done ir ow else ioe_decodingError
210 -- lambda-lifted, to avoid thunks being built in the inner-loop:
211 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
212 else input{ bufL=ir },
217 utf16le_decode :: DecodeBuffer
219 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
220 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
223 | ow >= os || ir >= iw = done ir ow
224 | ir + 1 == iw = done ir ow
226 c0 <- readWord8Buf iraw ir
227 c1 <- readWord8Buf iraw (ir+1)
228 let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0
230 then do writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
232 else if iw - ir < 4 then done ir ow else do
233 c2 <- readWord8Buf iraw (ir+2)
234 c3 <- readWord8Buf iraw (ir+3)
235 let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
236 if not (validate2 x1 x2) then invalid else do
237 writeCharBuf oraw ow (chr2 x1 x2)
240 invalid = if ir > ir0 then done ir ow else ioe_decodingError
242 -- lambda-lifted, to avoid thunks being built in the inner-loop:
243 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
244 else input{ bufL=ir },
249 ioe_decodingError :: IO a
250 ioe_decodingError = ioException
251 (IOError Nothing InvalidArgument "utf16_decode"
252 "invalid UTF-16 byte sequence" Nothing Nothing)
254 utf16be_encode :: EncodeBuffer
256 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
257 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
259 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
260 else input{ bufL=ir },
263 | ir >= iw = done ir ow
264 | os - ow < 2 = done ir ow
266 (c,ir') <- readCharBuf iraw ir
268 x | x < 0x10000 -> do
269 writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
270 writeWord8Buf oraw (ow+1) (fromIntegral x)
273 if os - ow < 4 then done ir ow else do
276 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
277 c2 = fromIntegral (n1 `shiftR` 10)
279 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
282 writeWord8Buf oraw ow c1
283 writeWord8Buf oraw (ow+1) c2
284 writeWord8Buf oraw (ow+2) c3
285 writeWord8Buf oraw (ow+3) c4
290 utf16le_encode :: EncodeBuffer
292 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
293 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
295 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
296 else input{ bufL=ir },
299 | ir >= iw = done ir ow
300 | os - ow < 2 = done ir ow
302 (c,ir') <- readCharBuf iraw ir
304 x | x < 0x10000 -> do
305 writeWord8Buf oraw ow (fromIntegral x)
306 writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
309 if os - ow < 4 then done ir ow else do
312 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
313 c2 = fromIntegral (n1 `shiftR` 10)
315 c3 = fromIntegral (n2 `shiftR` 8 + 0xDC)
318 writeWord8Buf oraw ow c2
319 writeWord8Buf oraw (ow+1) c1
320 writeWord8Buf oraw (ow+2) c4
321 writeWord8Buf oraw (ow+3) c3
326 chr2 :: Word16 -> Word16 -> Char
327 chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#))
331 !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10#
332 !lower# = y# -# 0xDC00#
335 validate1 :: Word16 -> Bool
336 validate1 x1 = (x1 >= 0 && x1 < 0xD800) || x1 > 0xDFFF
337 {-# INLINE validate1 #-}
339 validate2 :: Word16 -> Word16 -> Bool
340 validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF &&
341 x2 >= 0xDC00 && x2 <= 0xDFFF
342 {-# INLINE validate2 #-}