1 {-# LANGUAGE NoImplicitPrelude
3 , NondecreasingIndentation
6 {-# OPTIONS_GHC -funbox-strict-fields #-}
8 -----------------------------------------------------------------------------
10 -- Module : GHC.IO.Encoding.UTF32
11 -- Copyright : (c) The University of Glasgow, 2009
12 -- License : see libraries/base/LICENSE
14 -- Maintainer : libraries@haskell.org
15 -- Stability : internal
16 -- Portability : non-portable
18 -- UTF-32 Codecs for the IO library
20 -- Portions Copyright : (c) Tom Harper 2008-2009,
21 -- (c) Bryan O'Sullivan 2009,
22 -- (c) Duncan Coutts 2009
24 -----------------------------------------------------------------------------
26 module GHC.IO.Encoding.UTF32 (
44 import GHC.IO.Exception
46 import GHC.IO.Encoding.Types
52 -- -----------------------------------------------------------------------------
53 -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
56 utf32 = TextEncoding { textEncodingName = "UTF-32",
57 mkTextDecoder = utf32_DF,
58 mkTextEncoder = utf32_EF }
60 utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
62 seen_bom <- newIORef Nothing
64 encode = utf32_decode seen_bom,
66 getState = readIORef seen_bom,
67 setState = writeIORef seen_bom
70 utf32_EF :: IO (TextEncoder Bool)
72 done_bom <- newIORef False
74 encode = utf32_encode done_bom,
76 getState = readIORef done_bom,
77 setState = writeIORef done_bom
80 utf32_encode :: IORef Bool -> EncodeBuffer
81 utf32_encode done_bom input
82 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
84 b <- readIORef done_bom
85 if b then utf32_native_encode input output
87 then return (input,output)
89 writeIORef done_bom True
90 writeWord8Buf oraw ow bom0
91 writeWord8Buf oraw (ow+1) bom1
92 writeWord8Buf oraw (ow+2) bom2
93 writeWord8Buf oraw (ow+3) bom3
94 utf32_native_encode input output{ bufR = ow+4 }
96 utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
98 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
101 mb <- readIORef seen_bom
103 Just decode -> decode input output
105 if iw - ir < 4 then return (input,output) else do
106 c0 <- readWord8Buf iraw ir
107 c1 <- readWord8Buf iraw (ir+1)
108 c2 <- readWord8Buf iraw (ir+2)
109 c3 <- readWord8Buf iraw (ir+3)
111 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
112 writeIORef seen_bom (Just utf32be_decode)
113 utf32be_decode input{ bufL= ir+4 } output
114 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
115 writeIORef seen_bom (Just utf32le_decode)
116 utf32le_decode input{ bufL= ir+4 } output
118 writeIORef seen_bom (Just utf32_native_decode)
119 utf32_native_decode input output
122 bom0, bom1, bom2, bom3 :: Word8
128 -- choose UTF-32BE by default for UTF-32 output
129 utf32_native_decode :: DecodeBuffer
130 utf32_native_decode = utf32be_decode
132 utf32_native_encode :: EncodeBuffer
133 utf32_native_encode = utf32be_encode
135 -- -----------------------------------------------------------------------------
136 -- UTF32LE and UTF32BE
138 utf32be :: TextEncoding
139 utf32be = TextEncoding { textEncodingName = "UTF-32BE",
140 mkTextDecoder = utf32be_DF,
141 mkTextEncoder = utf32be_EF }
143 utf32be_DF :: IO (TextDecoder ())
145 return (BufferCodec {
146 encode = utf32be_decode,
148 getState = return (),
149 setState = const $ return ()
152 utf32be_EF :: IO (TextEncoder ())
154 return (BufferCodec {
155 encode = utf32be_encode,
157 getState = return (),
158 setState = const $ return ()
162 utf32le :: TextEncoding
163 utf32le = TextEncoding { textEncodingName = "UTF-32LE",
164 mkTextDecoder = utf32le_DF,
165 mkTextEncoder = utf32le_EF }
167 utf32le_DF :: IO (TextDecoder ())
169 return (BufferCodec {
170 encode = utf32le_decode,
172 getState = return (),
173 setState = const $ return ()
176 utf32le_EF :: IO (TextEncoder ())
178 return (BufferCodec {
179 encode = utf32le_encode,
181 getState = return (),
182 setState = const $ return ()
186 utf32be_decode :: DecodeBuffer
188 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
189 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
192 | ow >= os || iw - ir < 4 = done ir ow
194 c0 <- readWord8Buf iraw ir
195 c1 <- readWord8Buf iraw (ir+1)
196 c2 <- readWord8Buf iraw (ir+2)
197 c3 <- readWord8Buf iraw (ir+3)
198 let x1 = chr4 c0 c1 c2 c3
199 if not (validate x1) then invalid else do
200 ow' <- writeCharBuf oraw ow x1
203 invalid = if ir > ir0 then done ir ow else ioe_decodingError
205 -- lambda-lifted, to avoid thunks being built in the inner-loop:
206 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
207 else input{ bufL=ir },
212 utf32le_decode :: DecodeBuffer
214 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
215 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
218 | ow >= os || iw - ir < 4 = done ir ow
220 c0 <- readWord8Buf iraw ir
221 c1 <- readWord8Buf iraw (ir+1)
222 c2 <- readWord8Buf iraw (ir+2)
223 c3 <- readWord8Buf iraw (ir+3)
224 let x1 = chr4 c3 c2 c1 c0
225 if not (validate x1) then invalid else do
226 ow' <- writeCharBuf oraw ow x1
229 invalid = if ir > ir0 then done ir ow else ioe_decodingError
231 -- lambda-lifted, to avoid thunks being built in the inner-loop:
232 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
233 else input{ bufL=ir },
238 ioe_decodingError :: IO a
239 ioe_decodingError = ioException
240 (IOError Nothing InvalidArgument "utf32_decode"
241 "invalid UTF-32 byte sequence" Nothing Nothing)
243 utf32be_encode :: EncodeBuffer
245 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
246 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
248 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
249 else input{ bufL=ir },
252 | ir >= iw = done ir ow
253 | os - ow < 4 = done ir ow
255 (c,ir') <- readCharBuf iraw ir
256 let (c0,c1,c2,c3) = ord4 c
257 writeWord8Buf oraw ow c0
258 writeWord8Buf oraw (ow+1) c1
259 writeWord8Buf oraw (ow+2) c2
260 writeWord8Buf oraw (ow+3) c3
265 utf32le_encode :: EncodeBuffer
267 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
268 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
270 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
271 else input{ bufL=ir },
274 | ir >= iw = done ir ow
275 | os - ow < 4 = done ir ow
277 (c,ir') <- readCharBuf iraw ir
278 let (c0,c1,c2,c3) = ord4 c
279 writeWord8Buf oraw ow c3
280 writeWord8Buf oraw (ow+1) c2
281 writeWord8Buf oraw (ow+2) c1
282 writeWord8Buf oraw (ow+3) c0
287 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
288 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
289 C# (chr# (z1# +# z2# +# z3# +# z4#))
295 !z1# = uncheckedIShiftL# y1# 24#
296 !z2# = uncheckedIShiftL# y2# 16#
297 !z3# = uncheckedIShiftL# y3# 8#
301 ord4 :: Char -> (Word8,Word8,Word8,Word8)
302 ord4 c = (fromIntegral (x `shiftR` 24),
303 fromIntegral (x `shiftR` 16),
304 fromIntegral (x `shiftR` 8),
311 validate :: Char -> Bool
312 validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
314 {-# INLINE validate #-}