1 {-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF32
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-32 Codecs for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF32 (
39 import GHC.IO.Exception
41 import GHC.IO.Encoding.Types
47 -- -----------------------------------------------------------------------------
48 -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
51 utf32 = TextEncoding { mkTextDecoder = utf32_DF,
52 mkTextEncoder = utf32_EF }
54 utf32_DF :: IO TextDecoder
56 seen_bom <- newIORef Nothing
57 return (BufferCodec (utf32_decode seen_bom) (return ()))
59 utf32_EF :: IO TextEncoder
61 done_bom <- newIORef False
62 return (BufferCodec (utf32_encode done_bom) (return ()))
64 utf32_encode :: IORef Bool -> EncodeBuffer
65 utf32_encode done_bom input
66 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
68 b <- readIORef done_bom
69 if b then utf32_native_encode input output
71 then return (input,output)
73 writeIORef done_bom True
74 writeWord8Buf oraw ow bom0
75 writeWord8Buf oraw (ow+1) bom1
76 writeWord8Buf oraw (ow+2) bom2
77 writeWord8Buf oraw (ow+3) bom3
78 utf32_native_encode input output{ bufR = ow+4 }
80 utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
82 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
85 mb <- readIORef seen_bom
87 Just decode -> decode input output
89 if iw - ir < 4 then return (input,output) else do
90 c0 <- readWord8Buf iraw ir
91 c1 <- readWord8Buf iraw (ir+1)
92 c2 <- readWord8Buf iraw (ir+2)
93 c3 <- readWord8Buf iraw (ir+3)
95 _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
96 writeIORef seen_bom (Just utf32be_decode)
97 utf32be_decode input{ bufL= ir+4 } output
98 _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
99 writeIORef seen_bom (Just utf32le_decode)
100 utf32le_decode input{ bufL= ir+4 } output
102 writeIORef seen_bom (Just utf32_native_decode)
103 utf32_native_decode input output
106 bom0, bom1, bom2, bom3 :: Word8
112 -- choose UTF-32BE by default for UTF-32 output
113 utf32_native_decode :: DecodeBuffer
114 utf32_native_decode = utf32be_decode
116 utf32_native_encode :: EncodeBuffer
117 utf32_native_encode = utf32be_encode
119 -- -----------------------------------------------------------------------------
120 -- UTF32LE and UTF32BE
122 utf32be :: TextEncoding
123 utf32be = TextEncoding { mkTextDecoder = utf32be_DF,
124 mkTextEncoder = utf32be_EF }
126 utf32be_DF :: IO TextDecoder
127 utf32be_DF = return (BufferCodec utf32be_decode (return ()))
129 utf32be_EF :: IO TextEncoder
130 utf32be_EF = return (BufferCodec utf32be_encode (return ()))
133 utf32le :: TextEncoding
134 utf32le = TextEncoding { mkTextDecoder = utf32le_DF,
135 mkTextEncoder = utf32le_EF }
137 utf32le_DF :: IO TextDecoder
138 utf32le_DF = return (BufferCodec utf32le_decode (return ()))
140 utf32le_EF :: IO TextEncoder
141 utf32le_EF = return (BufferCodec utf32le_encode (return ()))
145 utf32be_decode :: DecodeBuffer
147 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
148 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
151 | ow >= os || iw - ir < 4 = done ir ow
153 c0 <- readWord8Buf iraw ir
154 c1 <- readWord8Buf iraw (ir+1)
155 c2 <- readWord8Buf iraw (ir+2)
156 c3 <- readWord8Buf iraw (ir+3)
157 let x1 = chr4 c0 c1 c2 c3
158 if not (validate x1) then invalid else do
159 writeCharBuf oraw ow x1
162 invalid = if ir > ir0 then done ir ow else ioe_decodingError
164 -- lambda-lifted, to avoid thunks being built in the inner-loop:
165 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
166 else input{ bufL=ir },
171 utf32le_decode :: DecodeBuffer
173 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
174 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
177 | ow >= os || iw - ir < 4 = done ir ow
179 c0 <- readWord8Buf iraw ir
180 c1 <- readWord8Buf iraw (ir+1)
181 c2 <- readWord8Buf iraw (ir+2)
182 c3 <- readWord8Buf iraw (ir+3)
183 let x1 = chr4 c3 c2 c1 c0
184 if not (validate x1) then invalid else do
185 writeCharBuf oraw ow x1
188 invalid = if ir > ir0 then done ir ow else ioe_decodingError
190 -- lambda-lifted, to avoid thunks being built in the inner-loop:
191 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
192 else input{ bufL=ir },
197 ioe_decodingError :: IO a
198 ioe_decodingError = ioException
199 (IOError Nothing InvalidArgument "utf32_decode"
200 "invalid UTF-32 byte sequence" Nothing Nothing)
202 utf32be_encode :: EncodeBuffer
204 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
205 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
207 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
208 else input{ bufL=ir },
211 | ir >= iw = done ir ow
212 | os - ow < 4 = done ir ow
214 (c,ir') <- readCharBuf iraw ir
215 let (c0,c1,c2,c3) = ord4 c
216 writeWord8Buf oraw ow c0
217 writeWord8Buf oraw (ow+1) c1
218 writeWord8Buf oraw (ow+2) c2
219 writeWord8Buf oraw (ow+3) c3
224 utf32le_encode :: EncodeBuffer
226 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
227 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
229 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
230 else input{ bufL=ir },
233 | ir >= iw = done ir ow
234 | os - ow < 4 = done ir ow
236 (c,ir') <- readCharBuf iraw ir
237 let (c0,c1,c2,c3) = ord4 c
238 writeWord8Buf oraw ow c3
239 writeWord8Buf oraw (ow+1) c2
240 writeWord8Buf oraw (ow+2) c1
241 writeWord8Buf oraw (ow+3) c0
246 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
247 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
248 C# (chr# (z1# +# z2# +# z3# +# z4#))
254 !z1# = uncheckedIShiftL# y1# 24#
255 !z2# = uncheckedIShiftL# y2# 16#
256 !z3# = uncheckedIShiftL# y3# 8#
260 ord4 :: Char -> (Word8,Word8,Word8,Word8)
261 ord4 c = (fromIntegral (x `shiftR` 24),
262 fromIntegral (x `shiftR` 16),
263 fromIntegral (x `shiftR` 8),
270 validate :: Char -> Bool
271 validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
273 {-# INLINE validate #-}