1 {-# LANGUAGE NoImplicitPrelude
3 , NondecreasingIndentation
6 {-# OPTIONS_GHC -funbox-strict-fields #-}
8 -----------------------------------------------------------------------------
10 -- Module : GHC.IO.Encoding.UTF8
11 -- Copyright : (c) The University of Glasgow, 2009
12 -- License : see libraries/base/LICENSE
14 -- Maintainer : libraries@haskell.org
15 -- Stability : internal
16 -- Portability : non-portable
18 -- UTF-8 Codec for the IO library
20 -- Portions Copyright : (c) Tom Harper 2008-2009,
21 -- (c) Bryan O'Sullivan 2009,
22 -- (c) Duncan Coutts 2009
24 -----------------------------------------------------------------------------
26 module GHC.IO.Encoding.UTF8 (
37 import GHC.IO.Encoding.Failure
38 import GHC.IO.Encoding.Types
43 utf8 = mkUTF8 ErrorOnCodingFailure
45 mkUTF8 :: CodingFailureMode -> TextEncoding
46 mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8",
47 mkTextDecoder = utf8_DF cfm,
48 mkTextEncoder = utf8_EF cfm }
51 utf8_DF :: CodingFailureMode -> IO (TextDecoder ())
55 recover = recoverDecode cfm,
58 setState = const $ return ()
61 utf8_EF :: CodingFailureMode -> IO (TextEncoder ())
65 recover = recoverEncode cfm,
68 setState = const $ return ()
71 utf8_bom :: TextEncoding
72 utf8_bom = mkUTF8_bom ErrorOnCodingFailure
74 mkUTF8_bom :: CodingFailureMode -> TextEncoding
75 mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM",
76 mkTextDecoder = utf8_bom_DF cfm,
77 mkTextEncoder = utf8_bom_EF cfm }
79 utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool)
83 encode = utf8_bom_decode ref,
84 recover = recoverDecode cfm,
86 getState = readIORef ref,
87 setState = writeIORef ref
90 utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool)
94 encode = utf8_bom_encode ref,
95 recover = recoverEncode cfm,
97 getState = readIORef ref,
98 setState = writeIORef ref
101 utf8_bom_decode :: IORef Bool -> DecodeBuffer
103 input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
106 first <- readIORef ref
108 then utf8_decode input output
110 let no_bom = do writeIORef ref False; utf8_decode input output
111 if iw - ir < 1 then return (InputUnderflow,input,output) else do
112 c0 <- readWord8Buf iraw ir
113 if (c0 /= bom0) then no_bom else do
114 if iw - ir < 2 then return (InputUnderflow,input,output) else do
115 c1 <- readWord8Buf iraw (ir+1)
116 if (c1 /= bom1) then no_bom else do
117 if iw - ir < 3 then return (InputUnderflow,input,output) else do
118 c2 <- readWord8Buf iraw (ir+2)
119 if (c2 /= bom2) then no_bom else do
120 -- found a BOM, ignore it and carry on
122 utf8_decode input{ bufL = ir + 3 } output
124 utf8_bom_encode :: IORef Bool -> EncodeBuffer
125 utf8_bom_encode ref input
126 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
129 if not b then utf8_encode input output
131 then return (OutputUnderflow,input,output)
134 writeWord8Buf oraw ow bom0
135 writeWord8Buf oraw (ow+1) bom1
136 writeWord8Buf oraw (ow+2) bom2
137 utf8_encode input output{ bufR = ow+3 }
139 bom0, bom1, bom2 :: Word8
144 utf8_decode :: DecodeBuffer
146 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
147 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
150 | ow >= os = done OutputUnderflow ir ow
151 | ir >= iw = done InputUnderflow ir ow
153 c0 <- readWord8Buf iraw ir
156 ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
158 | c0 >= 0xc0 && c0 <= 0xdf ->
159 if iw - ir < 2 then done InputUnderflow ir ow else do
160 c1 <- readWord8Buf iraw (ir+1)
161 if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
162 ow' <- writeCharBuf oraw ow (chr2 c0 c1)
164 | c0 >= 0xe0 && c0 <= 0xef ->
166 1 -> done InputUnderflow ir ow
167 2 -> do -- check for an error even when we don't have
168 -- the full sequence yet (#3341)
169 c1 <- readWord8Buf iraw (ir+1)
170 if not (validate3 c0 c1 0x80)
171 then invalid else done InputUnderflow ir ow
173 c1 <- readWord8Buf iraw (ir+1)
174 c2 <- readWord8Buf iraw (ir+2)
175 if not (validate3 c0 c1 c2) then invalid else do
176 ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2)
180 1 -> done InputUnderflow ir ow
181 2 -> do -- check for an error even when we don't have
182 -- the full sequence yet (#3341)
183 c1 <- readWord8Buf iraw (ir+1)
184 if not (validate4 c0 c1 0x80 0x80)
185 then invalid else done InputUnderflow ir ow
187 c1 <- readWord8Buf iraw (ir+1)
188 c2 <- readWord8Buf iraw (ir+2)
189 if not (validate4 c0 c1 c2 0x80)
190 then invalid else done InputUnderflow ir ow
192 c1 <- readWord8Buf iraw (ir+1)
193 c2 <- readWord8Buf iraw (ir+2)
194 c3 <- readWord8Buf iraw (ir+3)
195 if not (validate4 c0 c1 c2 c3) then invalid else do
196 ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
201 invalid = done InvalidSequence ir ow
203 -- lambda-lifted, to avoid thunks being built in the inner-loop:
204 done why !ir !ow = return (why,
205 if ir == iw then input{ bufL=0, bufR=0 }
206 else input{ bufL=ir },
211 utf8_encode :: EncodeBuffer
213 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
214 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
216 done why !ir !ow = return (why,
217 if ir == iw then input{ bufL=0, bufR=0 }
218 else input{ bufL=ir },
221 | ow >= os = done OutputUnderflow ir ow
222 | ir >= iw = done InputUnderflow ir ow
224 (c,ir') <- readCharBuf iraw ir
227 writeWord8Buf oraw ow (fromIntegral x)
230 if os - ow < 2 then done OutputUnderflow ir ow else do
232 writeWord8Buf oraw ow c1
233 writeWord8Buf oraw (ow+1) c2
235 | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do
236 if os - ow < 3 then done OutputUnderflow ir ow else do
237 let (c1,c2,c3) = ord3 c
238 writeWord8Buf oraw ow c1
239 writeWord8Buf oraw (ow+1) c2
240 writeWord8Buf oraw (ow+2) c3
243 if os - ow < 4 then done OutputUnderflow ir ow else do
244 let (c1,c2,c3,c4) = ord4 c
245 writeWord8Buf oraw ow c1
246 writeWord8Buf oraw (ow+1) c2
247 writeWord8Buf oraw (ow+2) c3
248 writeWord8Buf oraw (ow+3) c4
253 -- -----------------------------------------------------------------------------
254 -- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8
256 ord2 :: Char -> (Word8,Word8)
257 ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2)
260 x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
261 x2 = fromIntegral $ (n .&. 0x3F) + 0x80
263 ord3 :: Char -> (Word8,Word8,Word8)
264 ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3)
267 x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
268 x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
269 x3 = fromIntegral $ (n .&. 0x3F) + 0x80
271 ord4 :: Char -> (Word8,Word8,Word8,Word8)
272 ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4)
275 x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
276 x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
277 x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
278 x4 = fromIntegral $ (n .&. 0x3F) + 0x80
280 chr2 :: Word8 -> Word8 -> Char
281 chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
285 !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
289 chr3 :: Word8 -> Word8 -> Word8 -> Char
290 chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
295 !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
296 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
300 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
301 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
302 C# (chr# (z1# +# z2# +# z3# +# z4#))
308 !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
309 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
310 !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
314 between :: Word8 -- ^ byte to check
315 -> Word8 -- ^ lower bound
316 -> Word8 -- ^ upper bound
318 between x y z = x >= y && x <= z
319 {-# INLINE between #-}
321 validate3 :: Word8 -> Word8 -> Word8 -> Bool
322 {-# INLINE validate3 #-}
323 validate3 x1 x2 x3 = validate3_1 ||
328 validate3_1 = (x1 == 0xE0) &&
329 between x2 0xA0 0xBF &&
331 validate3_2 = between x1 0xE1 0xEC &&
332 between x2 0x80 0xBF &&
334 validate3_3 = x1 == 0xED &&
335 between x2 0x80 0x9F &&
337 validate3_4 = between x1 0xEE 0xEF &&
338 between x2 0x80 0xBF &&
341 validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
342 {-# INLINE validate4 #-}
343 validate4 x1 x2 x3 x4 = validate4_1 ||
347 validate4_1 = x1 == 0xF0 &&
348 between x2 0x90 0xBF &&
349 between x3 0x80 0xBF &&
351 validate4_2 = between x1 0xF1 0xF3 &&
352 between x2 0x80 0xBF &&
353 between x3 0x80 0xBF &&
355 validate4_3 = x1 == 0xF4 &&
356 between x2 0x80 0x8F &&
357 between x3 0x80 0xBF &&