1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF32
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-32 Codecs for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF32 (
39 import GHC.IO.Exception
41 import GHC.IO.Encoding.Types
47 -- -----------------------------------------------------------------------------
48 -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
51 utf32 = TextEncoding { mkTextDecoder = utf32_DF,
52 mkTextEncoder = utf32_EF }
54 utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
56 seen_bom <- newIORef Nothing
58 encode = utf32_decode seen_bom,
60 getState = readIORef seen_bom,
61 setState = writeIORef seen_bom
64 utf32_EF :: IO (TextEncoder Bool)
66 done_bom <- newIORef False
68 encode = utf32_encode done_bom,
70 getState = readIORef done_bom,
71 setState = writeIORef done_bom
74 utf32_encode :: IORef Bool -> EncodeBuffer
75 utf32_encode done_bom input
76 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
78 b <- readIORef done_bom
79 if b then utf32_native_encode input output
81 then return (input,output)
83 writeIORef done_bom True
84 writeWord8Buf oraw ow bom0
85 writeWord8Buf oraw (ow+1) bom1
86 writeWord8Buf oraw (ow+2) bom2
87 writeWord8Buf oraw (ow+3) bom3
88 utf32_native_encode input output{ bufR = ow+4 }
90 utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
92 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
95 mb <- readIORef seen_bom
97 Just decode -> decode input output
99 if iw - ir < 4 then return (input,output) else do
100 c0 <- readWord8Buf iraw ir
101 c1 <- readWord8Buf iraw (ir+1)
102 c2 <- readWord8Buf iraw (ir+2)
103 c3 <- readWord8Buf iraw (ir+3)
105 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
106 writeIORef seen_bom (Just utf32be_decode)
107 utf32be_decode input{ bufL= ir+4 } output
108 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
109 writeIORef seen_bom (Just utf32le_decode)
110 utf32le_decode input{ bufL= ir+4 } output
112 writeIORef seen_bom (Just utf32_native_decode)
113 utf32_native_decode input output
116 bom0, bom1, bom2, bom3 :: Word8
122 -- choose UTF-32BE by default for UTF-32 output
123 utf32_native_decode :: DecodeBuffer
124 utf32_native_decode = utf32be_decode
126 utf32_native_encode :: EncodeBuffer
127 utf32_native_encode = utf32be_encode
129 -- -----------------------------------------------------------------------------
130 -- UTF32LE and UTF32BE
132 utf32be :: TextEncoding
133 utf32be = TextEncoding { mkTextDecoder = utf32be_DF,
134 mkTextEncoder = utf32be_EF }
136 utf32be_DF :: IO (TextDecoder ())
138 return (BufferCodec {
139 encode = utf32be_decode,
141 getState = return (),
142 setState = const $ return ()
145 utf32be_EF :: IO (TextEncoder ())
147 return (BufferCodec {
148 encode = utf32be_encode,
150 getState = return (),
151 setState = const $ return ()
155 utf32le :: TextEncoding
156 utf32le = TextEncoding { mkTextDecoder = utf32le_DF,
157 mkTextEncoder = utf32le_EF }
159 utf32le_DF :: IO (TextDecoder ())
161 return (BufferCodec {
162 encode = utf32le_decode,
164 getState = return (),
165 setState = const $ return ()
168 utf32le_EF :: IO (TextEncoder ())
170 return (BufferCodec {
171 encode = utf32le_encode,
173 getState = return (),
174 setState = const $ return ()
178 utf32be_decode :: DecodeBuffer
180 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
181 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
184 | ow >= os || iw - ir < 4 = done ir ow
186 c0 <- readWord8Buf iraw ir
187 c1 <- readWord8Buf iraw (ir+1)
188 c2 <- readWord8Buf iraw (ir+2)
189 c3 <- readWord8Buf iraw (ir+3)
190 let x1 = chr4 c0 c1 c2 c3
191 if not (validate x1) then invalid else do
192 ow' <- writeCharBuf oraw ow x1
195 invalid = if ir > ir0 then done ir ow else ioe_decodingError
197 -- lambda-lifted, to avoid thunks being built in the inner-loop:
198 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
199 else input{ bufL=ir },
204 utf32le_decode :: DecodeBuffer
206 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
207 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
210 | ow >= os || iw - ir < 4 = done ir ow
212 c0 <- readWord8Buf iraw ir
213 c1 <- readWord8Buf iraw (ir+1)
214 c2 <- readWord8Buf iraw (ir+2)
215 c3 <- readWord8Buf iraw (ir+3)
216 let x1 = chr4 c3 c2 c1 c0
217 if not (validate x1) then invalid else do
218 ow' <- writeCharBuf oraw ow x1
221 invalid = if ir > ir0 then done ir ow else ioe_decodingError
223 -- lambda-lifted, to avoid thunks being built in the inner-loop:
224 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
225 else input{ bufL=ir },
230 ioe_decodingError :: IO a
231 ioe_decodingError = ioException
232 (IOError Nothing InvalidArgument "utf32_decode"
233 "invalid UTF-32 byte sequence" Nothing Nothing)
235 utf32be_encode :: EncodeBuffer
237 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
238 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
240 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
241 else input{ bufL=ir },
244 | ir >= iw = done ir ow
245 | os - ow < 4 = done ir ow
247 (c,ir') <- readCharBuf iraw ir
248 let (c0,c1,c2,c3) = ord4 c
249 writeWord8Buf oraw ow c0
250 writeWord8Buf oraw (ow+1) c1
251 writeWord8Buf oraw (ow+2) c2
252 writeWord8Buf oraw (ow+3) c3
257 utf32le_encode :: EncodeBuffer
259 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
260 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
262 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
263 else input{ bufL=ir },
266 | ir >= iw = done ir ow
267 | os - ow < 4 = done ir ow
269 (c,ir') <- readCharBuf iraw ir
270 let (c0,c1,c2,c3) = ord4 c
271 writeWord8Buf oraw ow c3
272 writeWord8Buf oraw (ow+1) c2
273 writeWord8Buf oraw (ow+2) c1
274 writeWord8Buf oraw (ow+3) c0
279 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
280 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
281 C# (chr# (z1# +# z2# +# z3# +# z4#))
287 !z1# = uncheckedIShiftL# y1# 24#
288 !z2# = uncheckedIShiftL# y2# 16#
289 !z3# = uncheckedIShiftL# y3# 8#
293 ord4 :: Char -> (Word8,Word8,Word8,Word8)
294 ord4 c = (fromIntegral (x `shiftR` 24),
295 fromIntegral (x `shiftR` 16),
296 fromIntegral (x `shiftR` 8),
303 validate :: Char -> Bool
304 validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
306 {-# INLINE validate #-}