1 {-# LANGUAGE NoImplicitPrelude
3 , NondecreasingIndentation
6 {-# OPTIONS_GHC -funbox-strict-fields #-}
8 -----------------------------------------------------------------------------
10 -- Module : GHC.IO.Encoding.UTF32
11 -- Copyright : (c) The University of Glasgow, 2009
12 -- License : see libraries/base/LICENSE
14 -- Maintainer : libraries@haskell.org
15 -- Stability : internal
16 -- Portability : non-portable
18 -- UTF-32 Codecs for the IO library
20 -- Portions Copyright : (c) Tom Harper 2008-2009,
21 -- (c) Bryan O'Sullivan 2009,
22 -- (c) Duncan Coutts 2009
24 -----------------------------------------------------------------------------
26 module GHC.IO.Encoding.UTF32 (
45 import GHC.IO.Encoding.Failure
46 import GHC.IO.Encoding.Types
52 -- -----------------------------------------------------------------------------
53 -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
56 utf32 = mkUTF32 ErrorOnCodingFailure
58 mkUTF32 :: CodingFailureMode -> TextEncoding
59 mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32",
60 mkTextDecoder = utf32_DF cfm,
61 mkTextEncoder = utf32_EF cfm }
63 utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
65 seen_bom <- newIORef Nothing
67 encode = utf32_decode seen_bom,
68 recover = recoverDecode cfm,
70 getState = readIORef seen_bom,
71 setState = writeIORef seen_bom
74 utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool)
76 done_bom <- newIORef False
78 encode = utf32_encode done_bom,
79 recover = recoverEncode cfm,
81 getState = readIORef done_bom,
82 setState = writeIORef done_bom
85 utf32_encode :: IORef Bool -> EncodeBuffer
86 utf32_encode done_bom input
87 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
89 b <- readIORef done_bom
90 if b then utf32_native_encode input output
92 then return (OutputUnderflow, input,output)
94 writeIORef done_bom True
95 writeWord8Buf oraw ow bom0
96 writeWord8Buf oraw (ow+1) bom1
97 writeWord8Buf oraw (ow+2) bom2
98 writeWord8Buf oraw (ow+3) bom3
99 utf32_native_encode input output{ bufR = ow+4 }
101 utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
102 utf32_decode seen_bom
103 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
106 mb <- readIORef seen_bom
108 Just decode -> decode input output
110 if iw - ir < 4 then return (InputUnderflow, input,output) else do
111 c0 <- readWord8Buf iraw ir
112 c1 <- readWord8Buf iraw (ir+1)
113 c2 <- readWord8Buf iraw (ir+2)
114 c3 <- readWord8Buf iraw (ir+3)
116 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
117 writeIORef seen_bom (Just utf32be_decode)
118 utf32be_decode input{ bufL= ir+4 } output
119 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
120 writeIORef seen_bom (Just utf32le_decode)
121 utf32le_decode input{ bufL= ir+4 } output
123 writeIORef seen_bom (Just utf32_native_decode)
124 utf32_native_decode input output
127 bom0, bom1, bom2, bom3 :: Word8
133 -- choose UTF-32BE by default for UTF-32 output
134 utf32_native_decode :: DecodeBuffer
135 utf32_native_decode = utf32be_decode
137 utf32_native_encode :: EncodeBuffer
138 utf32_native_encode = utf32be_encode
140 -- -----------------------------------------------------------------------------
141 -- UTF32LE and UTF32BE
143 utf32be :: TextEncoding
144 utf32be = mkUTF32be ErrorOnCodingFailure
146 mkUTF32be :: CodingFailureMode -> TextEncoding
147 mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE",
148 mkTextDecoder = utf32be_DF cfm,
149 mkTextEncoder = utf32be_EF cfm }
151 utf32be_DF :: CodingFailureMode -> IO (TextDecoder ())
153 return (BufferCodec {
154 encode = utf32be_decode,
155 recover = recoverDecode cfm,
157 getState = return (),
158 setState = const $ return ()
161 utf32be_EF :: CodingFailureMode -> IO (TextEncoder ())
163 return (BufferCodec {
164 encode = utf32be_encode,
165 recover = recoverEncode cfm,
167 getState = return (),
168 setState = const $ return ()
172 utf32le :: TextEncoding
173 utf32le = mkUTF32le ErrorOnCodingFailure
175 mkUTF32le :: CodingFailureMode -> TextEncoding
176 mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE",
177 mkTextDecoder = utf32le_DF cfm,
178 mkTextEncoder = utf32le_EF cfm }
180 utf32le_DF :: CodingFailureMode -> IO (TextDecoder ())
182 return (BufferCodec {
183 encode = utf32le_decode,
184 recover = recoverDecode cfm,
186 getState = return (),
187 setState = const $ return ()
190 utf32le_EF :: CodingFailureMode -> IO (TextEncoder ())
192 return (BufferCodec {
193 encode = utf32le_encode,
194 recover = recoverEncode cfm,
196 getState = return (),
197 setState = const $ return ()
201 utf32be_decode :: DecodeBuffer
203 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
204 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
207 | ow >= os = done OutputUnderflow ir ow
208 | iw - ir < 4 = done InputUnderflow ir ow
210 c0 <- readWord8Buf iraw ir
211 c1 <- readWord8Buf iraw (ir+1)
212 c2 <- readWord8Buf iraw (ir+2)
213 c3 <- readWord8Buf iraw (ir+3)
214 let x1 = chr4 c0 c1 c2 c3
215 if not (validate x1) then invalid else do
216 ow' <- writeCharBuf oraw ow x1
219 invalid = done InvalidSequence ir ow
221 -- lambda-lifted, to avoid thunks being built in the inner-loop:
222 done why !ir !ow = return (why,
223 if ir == iw then input{ bufL=0, bufR=0 }
224 else input{ bufL=ir },
229 utf32le_decode :: DecodeBuffer
231 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
232 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
235 | ow >= os = done OutputUnderflow ir ow
236 | iw - ir < 4 = done InputUnderflow ir ow
238 c0 <- readWord8Buf iraw ir
239 c1 <- readWord8Buf iraw (ir+1)
240 c2 <- readWord8Buf iraw (ir+2)
241 c3 <- readWord8Buf iraw (ir+3)
242 let x1 = chr4 c3 c2 c1 c0
243 if not (validate x1) then invalid else do
244 ow' <- writeCharBuf oraw ow x1
247 invalid = done InvalidSequence ir ow
249 -- lambda-lifted, to avoid thunks being built in the inner-loop:
250 done why !ir !ow = return (why,
251 if ir == iw then input{ bufL=0, bufR=0 }
252 else input{ bufL=ir },
257 utf32be_encode :: EncodeBuffer
259 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
260 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
262 done why !ir !ow = return (why,
263 if ir == iw then input{ bufL=0, bufR=0 }
264 else input{ bufL=ir },
267 | ir >= iw = done InputUnderflow ir ow
268 | os - ow < 4 = done OutputUnderflow ir ow
270 (c,ir') <- readCharBuf iraw ir
271 if isSurrogate c then done InvalidSequence ir ow else do
272 let (c0,c1,c2,c3) = ord4 c
273 writeWord8Buf oraw ow c0
274 writeWord8Buf oraw (ow+1) c1
275 writeWord8Buf oraw (ow+2) c2
276 writeWord8Buf oraw (ow+3) c3
281 utf32le_encode :: EncodeBuffer
283 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
284 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
286 done why !ir !ow = return (why,
287 if ir == iw then input{ bufL=0, bufR=0 }
288 else input{ bufL=ir },
291 | ir >= iw = done InputUnderflow ir ow
292 | os - ow < 4 = done OutputUnderflow ir ow
294 (c,ir') <- readCharBuf iraw ir
295 if isSurrogate c then done InvalidSequence ir ow else do
296 let (c0,c1,c2,c3) = ord4 c
297 writeWord8Buf oraw ow c3
298 writeWord8Buf oraw (ow+1) c2
299 writeWord8Buf oraw (ow+2) c1
300 writeWord8Buf oraw (ow+3) c0
305 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
306 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
307 C# (chr# (z1# +# z2# +# z3# +# z4#))
313 !z1# = uncheckedIShiftL# y1# 24#
314 !z2# = uncheckedIShiftL# y2# 16#
315 !z3# = uncheckedIShiftL# y3# 8#
319 ord4 :: Char -> (Word8,Word8,Word8,Word8)
320 ord4 c = (fromIntegral (x `shiftR` 24),
321 fromIntegral (x `shiftR` 16),
322 fromIntegral (x `shiftR` 8),
329 validate :: Char -> Bool
330 validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
332 {-# INLINE validate #-}