1 {-# LANGUAGE NoImplicitPrelude
3 , NondecreasingIndentation
6 {-# OPTIONS_GHC -funbox-strict-fields #-}
8 -----------------------------------------------------------------------------
10 -- Module : GHC.IO.Encoding.UTF8
11 -- Copyright : (c) The University of Glasgow, 2009
12 -- License : see libraries/base/LICENSE
14 -- Maintainer : libraries@haskell.org
15 -- Stability : internal
16 -- Portability : non-portable
18 -- UTF-8 Codec for the IO library
20 -- Portions Copyright : (c) Tom Harper 2008-2009,
21 -- (c) Bryan O'Sullivan 2009,
22 -- (c) Duncan Coutts 2009
24 -----------------------------------------------------------------------------
26 module GHC.IO.Encoding.UTF8 (
36 import GHC.IO.Exception
38 import GHC.IO.Encoding.Types
44 utf8 = TextEncoding { textEncodingName = "UTF-8",
45 mkTextDecoder = utf8_DF,
46 mkTextEncoder = utf8_EF }
48 utf8_DF :: IO (TextDecoder ())
54 setState = const $ return ()
57 utf8_EF :: IO (TextEncoder ())
63 setState = const $ return ()
66 utf8_bom :: TextEncoding
67 utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM",
68 mkTextDecoder = utf8_bom_DF,
69 mkTextEncoder = utf8_bom_EF }
71 utf8_bom_DF :: IO (TextDecoder Bool)
75 encode = utf8_bom_decode ref,
77 getState = readIORef ref,
78 setState = writeIORef ref
81 utf8_bom_EF :: IO (TextEncoder Bool)
85 encode = utf8_bom_encode ref,
87 getState = readIORef ref,
88 setState = writeIORef ref
91 utf8_bom_decode :: IORef Bool -> DecodeBuffer
93 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
96 first <- readIORef ref
98 then utf8_decode input output
100 let no_bom = do writeIORef ref False; utf8_decode input output
101 if iw - ir < 1 then return (input,output) else do
102 c0 <- readWord8Buf iraw ir
103 if (c0 /= bom0) then no_bom else do
104 if iw - ir < 2 then return (input,output) else do
105 c1 <- readWord8Buf iraw (ir+1)
106 if (c1 /= bom1) then no_bom else do
107 if iw - ir < 3 then return (input,output) else do
108 c2 <- readWord8Buf iraw (ir+2)
109 if (c2 /= bom2) then no_bom else do
110 -- found a BOM, ignore it and carry on
112 utf8_decode input{ bufL = ir + 3 } output
114 utf8_bom_encode :: IORef Bool -> EncodeBuffer
115 utf8_bom_encode ref input
116 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
119 if not b then utf8_encode input output
121 then return (input,output)
124 writeWord8Buf oraw ow bom0
125 writeWord8Buf oraw (ow+1) bom1
126 writeWord8Buf oraw (ow+2) bom2
127 utf8_encode input output{ bufR = ow+3 }
129 bom0, bom1, bom2 :: Word8
134 utf8_decode :: DecodeBuffer
136 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
137 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
140 | ow >= os || ir >= iw = done ir ow
142 c0 <- readWord8Buf iraw ir
145 ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
147 | c0 >= 0xc0 && c0 <= 0xdf ->
148 if iw - ir < 2 then done ir ow else do
149 c1 <- readWord8Buf iraw (ir+1)
150 if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
151 ow' <- writeCharBuf oraw ow (chr2 c0 c1)
153 | c0 >= 0xe0 && c0 <= 0xef ->
156 2 -> do -- check for an error even when we don't have
157 -- the full sequence yet (#3341)
158 c1 <- readWord8Buf iraw (ir+1)
159 if not (validate3 c0 c1 0x80)
160 then invalid else done ir ow
162 c1 <- readWord8Buf iraw (ir+1)
163 c2 <- readWord8Buf iraw (ir+2)
164 if not (validate3 c0 c1 c2) then invalid else do
165 ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2)
170 2 -> do -- check for an error even when we don't have
171 -- the full sequence yet (#3341)
172 c1 <- readWord8Buf iraw (ir+1)
173 if not (validate4 c0 c1 0x80 0x80)
174 then invalid else done ir ow
176 c1 <- readWord8Buf iraw (ir+1)
177 c2 <- readWord8Buf iraw (ir+2)
178 if not (validate4 c0 c1 c2 0x80)
179 then invalid else done ir ow
181 c1 <- readWord8Buf iraw (ir+1)
182 c2 <- readWord8Buf iraw (ir+2)
183 c3 <- readWord8Buf iraw (ir+3)
184 if not (validate4 c0 c1 c2 c3) then invalid else do
185 ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
190 invalid = if ir > ir0 then done ir ow else ioe_decodingError
192 -- lambda-lifted, to avoid thunks being built in the inner-loop:
193 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
194 else input{ bufL=ir },
199 ioe_decodingError :: IO a
200 ioe_decodingError = ioException
201 (IOError Nothing InvalidArgument "utf8_decode"
202 "invalid UTF-8 byte sequence" Nothing Nothing)
204 utf8_encode :: EncodeBuffer
206 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
207 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
209 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
210 else input{ bufL=ir },
213 | ow >= os || ir >= iw = done ir ow
215 (c,ir') <- readCharBuf iraw ir
218 writeWord8Buf oraw ow (fromIntegral x)
221 if os - ow < 2 then done ir ow else do
223 writeWord8Buf oraw ow c1
224 writeWord8Buf oraw (ow+1) c2
227 if os - ow < 3 then done ir ow else do
228 let (c1,c2,c3) = ord3 c
229 writeWord8Buf oraw ow c1
230 writeWord8Buf oraw (ow+1) c2
231 writeWord8Buf oraw (ow+2) c3
234 if os - ow < 4 then done ir ow else do
235 let (c1,c2,c3,c4) = ord4 c
236 writeWord8Buf oraw ow c1
237 writeWord8Buf oraw (ow+1) c2
238 writeWord8Buf oraw (ow+2) c3
239 writeWord8Buf oraw (ow+3) c4
244 -- -----------------------------------------------------------------------------
245 -- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8
247 ord2 :: Char -> (Word8,Word8)
248 ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2)
251 x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
252 x2 = fromIntegral $ (n .&. 0x3F) + 0x80
254 ord3 :: Char -> (Word8,Word8,Word8)
255 ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3)
258 x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
259 x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
260 x3 = fromIntegral $ (n .&. 0x3F) + 0x80
262 ord4 :: Char -> (Word8,Word8,Word8,Word8)
263 ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4)
266 x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
267 x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
268 x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
269 x4 = fromIntegral $ (n .&. 0x3F) + 0x80
271 chr2 :: Word8 -> Word8 -> Char
272 chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
276 !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
280 chr3 :: Word8 -> Word8 -> Word8 -> Char
281 chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
286 !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
287 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
291 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
292 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
293 C# (chr# (z1# +# z2# +# z3# +# z4#))
299 !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
300 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
301 !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
305 between :: Word8 -- ^ byte to check
306 -> Word8 -- ^ lower bound
307 -> Word8 -- ^ upper bound
309 between x y z = x >= y && x <= z
310 {-# INLINE between #-}
312 validate3 :: Word8 -> Word8 -> Word8 -> Bool
313 {-# INLINE validate3 #-}
314 validate3 x1 x2 x3 = validate3_1 ||
319 validate3_1 = (x1 == 0xE0) &&
320 between x2 0xA0 0xBF &&
322 validate3_2 = between x1 0xE1 0xEC &&
323 between x2 0x80 0xBF &&
325 validate3_3 = x1 == 0xED &&
326 between x2 0x80 0x9F &&
328 validate3_4 = between x1 0xEE 0xEF &&
329 between x2 0x80 0xBF &&
332 validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
333 {-# INLINE validate4 #-}
334 validate4 x1 x2 x3 x4 = validate4_1 ||
338 validate4_1 = x1 == 0xF0 &&
339 between x2 0x90 0xBF &&
340 between x3 0x80 0xBF &&
342 validate4_2 = between x1 0xF1 0xF3 &&
343 between x2 0x80 0xBF &&
344 between x3 0x80 0xBF &&
346 validate4_3 = x1 == 0xF4 &&
347 between x2 0x80 0x8F &&
348 between x3 0x80 0xBF &&