1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF32
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-32 Codecs for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF32 (
39 import GHC.IO.Exception
41 import GHC.IO.Encoding.Types
47 -- -----------------------------------------------------------------------------
48 -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
51 utf32 = TextEncoding { textEncodingName = "UTF-32",
52 mkTextDecoder = utf32_DF,
53 mkTextEncoder = utf32_EF }
55 utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
57 seen_bom <- newIORef Nothing
59 encode = utf32_decode seen_bom,
61 getState = readIORef seen_bom,
62 setState = writeIORef seen_bom
65 utf32_EF :: IO (TextEncoder Bool)
67 done_bom <- newIORef False
69 encode = utf32_encode done_bom,
71 getState = readIORef done_bom,
72 setState = writeIORef done_bom
75 utf32_encode :: IORef Bool -> EncodeBuffer
76 utf32_encode done_bom input
77 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
79 b <- readIORef done_bom
80 if b then utf32_native_encode input output
82 then return (input,output)
84 writeIORef done_bom True
85 writeWord8Buf oraw ow bom0
86 writeWord8Buf oraw (ow+1) bom1
87 writeWord8Buf oraw (ow+2) bom2
88 writeWord8Buf oraw (ow+3) bom3
89 utf32_native_encode input output{ bufR = ow+4 }
91 utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
93 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
96 mb <- readIORef seen_bom
98 Just decode -> decode input output
100 if iw - ir < 4 then return (input,output) else do
101 c0 <- readWord8Buf iraw ir
102 c1 <- readWord8Buf iraw (ir+1)
103 c2 <- readWord8Buf iraw (ir+2)
104 c3 <- readWord8Buf iraw (ir+3)
106 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
107 writeIORef seen_bom (Just utf32be_decode)
108 utf32be_decode input{ bufL= ir+4 } output
109 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
110 writeIORef seen_bom (Just utf32le_decode)
111 utf32le_decode input{ bufL= ir+4 } output
113 writeIORef seen_bom (Just utf32_native_decode)
114 utf32_native_decode input output
117 bom0, bom1, bom2, bom3 :: Word8
123 -- choose UTF-32BE by default for UTF-32 output
124 utf32_native_decode :: DecodeBuffer
125 utf32_native_decode = utf32be_decode
127 utf32_native_encode :: EncodeBuffer
128 utf32_native_encode = utf32be_encode
130 -- -----------------------------------------------------------------------------
131 -- UTF32LE and UTF32BE
133 utf32be :: TextEncoding
134 utf32be = TextEncoding { textEncodingName = "UTF-32BE",
135 mkTextDecoder = utf32be_DF,
136 mkTextEncoder = utf32be_EF }
138 utf32be_DF :: IO (TextDecoder ())
140 return (BufferCodec {
141 encode = utf32be_decode,
143 getState = return (),
144 setState = const $ return ()
147 utf32be_EF :: IO (TextEncoder ())
149 return (BufferCodec {
150 encode = utf32be_encode,
152 getState = return (),
153 setState = const $ return ()
157 utf32le :: TextEncoding
158 utf32le = TextEncoding { textEncodingName = "UTF-32LE",
159 mkTextDecoder = utf32le_DF,
160 mkTextEncoder = utf32le_EF }
162 utf32le_DF :: IO (TextDecoder ())
164 return (BufferCodec {
165 encode = utf32le_decode,
167 getState = return (),
168 setState = const $ return ()
171 utf32le_EF :: IO (TextEncoder ())
173 return (BufferCodec {
174 encode = utf32le_encode,
176 getState = return (),
177 setState = const $ return ()
181 utf32be_decode :: DecodeBuffer
183 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
184 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
187 | ow >= os || iw - ir < 4 = done ir ow
189 c0 <- readWord8Buf iraw ir
190 c1 <- readWord8Buf iraw (ir+1)
191 c2 <- readWord8Buf iraw (ir+2)
192 c3 <- readWord8Buf iraw (ir+3)
193 let x1 = chr4 c0 c1 c2 c3
194 if not (validate x1) then invalid else do
195 ow' <- writeCharBuf oraw ow x1
198 invalid = if ir > ir0 then done ir ow else ioe_decodingError
200 -- lambda-lifted, to avoid thunks being built in the inner-loop:
201 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
202 else input{ bufL=ir },
207 utf32le_decode :: DecodeBuffer
209 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
210 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
213 | ow >= os || iw - ir < 4 = done ir ow
215 c0 <- readWord8Buf iraw ir
216 c1 <- readWord8Buf iraw (ir+1)
217 c2 <- readWord8Buf iraw (ir+2)
218 c3 <- readWord8Buf iraw (ir+3)
219 let x1 = chr4 c3 c2 c1 c0
220 if not (validate x1) then invalid else do
221 ow' <- writeCharBuf oraw ow x1
224 invalid = if ir > ir0 then done ir ow else ioe_decodingError
226 -- lambda-lifted, to avoid thunks being built in the inner-loop:
227 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
228 else input{ bufL=ir },
233 ioe_decodingError :: IO a
234 ioe_decodingError = ioException
235 (IOError Nothing InvalidArgument "utf32_decode"
236 "invalid UTF-32 byte sequence" Nothing Nothing)
238 utf32be_encode :: EncodeBuffer
240 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
241 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
243 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
244 else input{ bufL=ir },
247 | ir >= iw = done ir ow
248 | os - ow < 4 = done ir ow
250 (c,ir') <- readCharBuf iraw ir
251 let (c0,c1,c2,c3) = ord4 c
252 writeWord8Buf oraw ow c0
253 writeWord8Buf oraw (ow+1) c1
254 writeWord8Buf oraw (ow+2) c2
255 writeWord8Buf oraw (ow+3) c3
260 utf32le_encode :: EncodeBuffer
262 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
263 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
265 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
266 else input{ bufL=ir },
269 | ir >= iw = done ir ow
270 | os - ow < 4 = done ir ow
272 (c,ir') <- readCharBuf iraw ir
273 let (c0,c1,c2,c3) = ord4 c
274 writeWord8Buf oraw ow c3
275 writeWord8Buf oraw (ow+1) c2
276 writeWord8Buf oraw (ow+2) c1
277 writeWord8Buf oraw (ow+3) c0
282 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
283 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
284 C# (chr# (z1# +# z2# +# z3# +# z4#))
290 !z1# = uncheckedIShiftL# y1# 24#
291 !z2# = uncheckedIShiftL# y2# 16#
292 !z3# = uncheckedIShiftL# y3# 8#
296 ord4 :: Char -> (Word8,Word8,Word8,Word8)
297 ord4 c = (fromIntegral (x `shiftR` 24),
298 fromIntegral (x `shiftR` 16),
299 fromIntegral (x `shiftR` 8),
306 validate :: Char -> Bool
307 validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
309 {-# INLINE validate #-}