1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF8
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-8 Codec for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF8 (
31 import GHC.IO.Exception
33 import GHC.IO.Encoding.Types
39 utf8 = TextEncoding { mkTextDecoder = utf8_DF,
40 mkTextEncoder = utf8_EF }
42 utf8_DF :: IO (TextDecoder ())
48 setState = const $ return ()
51 utf8_EF :: IO (TextEncoder ())
57 setState = const $ return ()
60 utf8_bom :: TextEncoding
61 utf8_bom = TextEncoding { mkTextDecoder = utf8_bom_DF,
62 mkTextEncoder = utf8_bom_EF }
64 utf8_bom_DF :: IO (TextDecoder Bool)
68 encode = utf8_bom_decode ref,
70 getState = readIORef ref,
71 setState = writeIORef ref
74 utf8_bom_EF :: IO (TextEncoder Bool)
78 encode = utf8_bom_encode ref,
80 getState = readIORef ref,
81 setState = writeIORef ref
84 utf8_bom_decode :: IORef Bool -> DecodeBuffer
86 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
89 first <- readIORef ref
91 then utf8_decode input output
93 let no_bom = do writeIORef ref False; utf8_decode input output
94 if iw - ir < 1 then return (input,output) else do
95 c0 <- readWord8Buf iraw ir
96 if (c0 /= bom0) then no_bom else do
97 if iw - ir < 2 then return (input,output) else do
98 c1 <- readWord8Buf iraw (ir+1)
99 if (c1 /= bom1) then no_bom else do
100 if iw - ir < 3 then return (input,output) else do
101 c2 <- readWord8Buf iraw (ir+2)
102 if (c2 /= bom2) then no_bom else do
103 -- found a BOM, ignore it and carry on
105 utf8_decode input{ bufL = ir + 3 } output
107 utf8_bom_encode :: IORef Bool -> EncodeBuffer
108 utf8_bom_encode ref input
109 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
112 if not b then utf8_encode input output
114 then return (input,output)
117 writeWord8Buf oraw ow bom0
118 writeWord8Buf oraw (ow+1) bom1
119 writeWord8Buf oraw (ow+2) bom2
120 utf8_encode input output{ bufR = ow+3 }
122 bom0, bom1, bom2 :: Word8
127 utf8_decode :: DecodeBuffer
129 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
130 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
133 | ow >= os || ir >= iw = done ir ow
135 c0 <- readWord8Buf iraw ir
138 ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
140 | c0 >= 0xc0 && c0 <= 0xdf ->
141 if iw - ir < 2 then done ir ow else do
142 c1 <- readWord8Buf iraw (ir+1)
143 if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
144 ow' <- writeCharBuf oraw ow (chr2 c0 c1)
146 | c0 >= 0xe0 && c0 <= 0xef ->
147 if iw - ir < 3 then done ir ow else do
148 c1 <- readWord8Buf iraw (ir+1)
149 c2 <- readWord8Buf iraw (ir+2)
150 if not (validate3 c0 c1 c2) then invalid else do
151 ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2)
154 if iw - ir < 4 then done ir ow else do
155 c1 <- readWord8Buf iraw (ir+1)
156 c2 <- readWord8Buf iraw (ir+2)
157 c3 <- readWord8Buf iraw (ir+3)
158 if not (validate4 c0 c1 c2 c3) then invalid else do
159 ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
164 invalid = if ir > ir0 then done ir ow else ioe_decodingError
166 -- lambda-lifted, to avoid thunks being built in the inner-loop:
167 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
168 else input{ bufL=ir },
173 ioe_decodingError :: IO a
174 ioe_decodingError = ioException
175 (IOError Nothing InvalidArgument "utf8_decode"
176 "invalid UTF-8 byte sequence" Nothing Nothing)
178 utf8_encode :: EncodeBuffer
180 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
181 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
183 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
184 else input{ bufL=ir },
187 | ow >= os || ir >= iw = done ir ow
189 (c,ir') <- readCharBuf iraw ir
192 writeWord8Buf oraw ow (fromIntegral x)
195 if os - ow < 2 then done ir ow else do
197 writeWord8Buf oraw ow c1
198 writeWord8Buf oraw (ow+1) c2
201 if os - ow < 3 then done ir ow else do
202 let (c1,c2,c3) = ord3 c
203 writeWord8Buf oraw ow c1
204 writeWord8Buf oraw (ow+1) c2
205 writeWord8Buf oraw (ow+2) c3
208 if os - ow < 4 then done ir ow else do
209 let (c1,c2,c3,c4) = ord4 c
210 writeWord8Buf oraw ow c1
211 writeWord8Buf oraw (ow+1) c2
212 writeWord8Buf oraw (ow+2) c3
213 writeWord8Buf oraw (ow+3) c4
218 -- -----------------------------------------------------------------------------
219 -- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8
221 ord2 :: Char -> (Word8,Word8)
222 ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2)
225 x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
226 x2 = fromIntegral $ (n .&. 0x3F) + 0x80
228 ord3 :: Char -> (Word8,Word8,Word8)
229 ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3)
232 x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
233 x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
234 x3 = fromIntegral $ (n .&. 0x3F) + 0x80
236 ord4 :: Char -> (Word8,Word8,Word8,Word8)
237 ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4)
240 x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
241 x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
242 x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
243 x4 = fromIntegral $ (n .&. 0x3F) + 0x80
245 chr2 :: Word8 -> Word8 -> Char
246 chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
250 !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
254 chr3 :: Word8 -> Word8 -> Word8 -> Char
255 chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
260 !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
261 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
265 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
266 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
267 C# (chr# (z1# +# z2# +# z3# +# z4#))
273 !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
274 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
275 !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
279 between :: Word8 -- ^ byte to check
280 -> Word8 -- ^ lower bound
281 -> Word8 -- ^ upper bound
283 between x y z = x >= y && x <= z
284 {-# INLINE between #-}
286 validate3 :: Word8 -> Word8 -> Word8 -> Bool
287 {-# INLINE validate3 #-}
288 validate3 x1 x2 x3 = validate3_1 ||
293 validate3_1 = (x1 == 0xE0) &&
294 between x2 0xA0 0xBF &&
296 validate3_2 = between x1 0xE1 0xEC &&
297 between x2 0x80 0xBF &&
299 validate3_3 = x1 == 0xED &&
300 between x2 0x80 0x9F &&
302 validate3_4 = between x1 0xEE 0xEF &&
303 between x2 0x80 0xBF &&
306 validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
307 {-# INLINE validate4 #-}
308 validate4 x1 x2 x3 x4 = validate4_1 ||
312 validate4_1 = x1 == 0xF0 &&
313 between x2 0x90 0xBF &&
314 between x3 0x80 0xBF &&
316 validate4_2 = between x1 0xF1 0xF3 &&
317 between x2 0x80 0xBF &&
318 between x3 0x80 0xBF &&
320 validate4_3 = x1 == 0xF4 &&
321 between x2 0x80 0x8F &&
322 between x3 0x80 0xBF &&