1 {-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
2 {-# LANGUAGE BangPatterns #-}
3 -----------------------------------------------------------------------------
5 -- Module : GHC.IO.Encoding.UTF8
6 -- Copyright : (c) The University of Glasgow, 2009
7 -- License : see libraries/base/LICENSE
9 -- Maintainer : libraries@haskell.org
10 -- Stability : internal
11 -- Portability : non-portable
13 -- UTF-8 Codec for the IO library
15 -- Portions Copyright : (c) Tom Harper 2008-2009,
16 -- (c) Bryan O'Sullivan 2009,
17 -- (c) Duncan Coutts 2009
19 -----------------------------------------------------------------------------
21 module GHC.IO.Encoding.UTF8 (
31 import GHC.IO.Exception
33 import GHC.IO.Encoding.Types
39 utf8 = TextEncoding { mkTextDecoder = utf8_DF,
40 mkTextEncoder = utf8_EF }
42 utf8_DF :: IO TextDecoder
43 utf8_DF = return (BufferCodec utf8_decode (return ()))
45 utf8_EF :: IO TextEncoder
46 utf8_EF = return (BufferCodec utf8_encode (return ()))
48 utf8_decode :: DecodeBuffer
50 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
51 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
54 | ow >= os || ir >= iw = done ir ow
56 c0 <- readWord8Buf iraw ir
59 writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
61 | c0 >= 0xc0 && c0 <= 0xdf ->
62 if iw - ir < 2 then done ir ow else do
63 c1 <- readWord8Buf iraw (ir+1)
64 if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
65 writeCharBuf oraw ow (chr2 c0 c1)
67 | c0 >= 0xe0 && c0 <= 0xef ->
68 if iw - ir < 3 then done ir ow else do
69 c1 <- readWord8Buf iraw (ir+1)
70 c2 <- readWord8Buf iraw (ir+2)
71 if not (validate3 c0 c1 c2) then invalid else do
72 writeCharBuf oraw ow (chr3 c0 c1 c2)
75 if iw - ir < 4 then done ir ow else do
76 c1 <- readWord8Buf iraw (ir+1)
77 c2 <- readWord8Buf iraw (ir+2)
78 c3 <- readWord8Buf iraw (ir+3)
79 if not (validate4 c0 c1 c2 c3) then invalid else do
80 writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
83 invalid = if ir > ir0 then done ir ow else ioe_decodingError
85 -- lambda-lifted, to avoid thunks being built in the inner-loop:
86 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
87 else input{ bufL=ir },
92 ioe_decodingError :: IO a
93 ioe_decodingError = ioException
94 (IOError Nothing InvalidArgument "utf8_decode"
95 "invalid UTF-8 byte sequence" Nothing Nothing)
97 utf8_encode :: EncodeBuffer
99 input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
100 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
102 done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
103 else input{ bufL=ir },
106 | ow >= os || ir >= iw = done ir ow
108 (c,ir') <- readCharBuf iraw ir
111 writeWord8Buf oraw ow (fromIntegral x)
114 if os - ow < 2 then done ir ow else do
116 writeWord8Buf oraw ow c1
117 writeWord8Buf oraw (ow+1) c2
120 if os - ow < 3 then done ir ow else do
121 let (c1,c2,c3) = ord3 c
122 writeWord8Buf oraw ow c1
123 writeWord8Buf oraw (ow+1) c2
124 writeWord8Buf oraw (ow+2) c3
127 if os - ow < 4 then done ir ow else do
128 let (c1,c2,c3,c4) = ord4 c
129 writeWord8Buf oraw ow c1
130 writeWord8Buf oraw (ow+1) c2
131 writeWord8Buf oraw (ow+2) c3
132 writeWord8Buf oraw (ow+3) c4
137 -- -----------------------------------------------------------------------------
138 -- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8
140 ord2 :: Char -> (Word8,Word8)
141 ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2)
144 x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
145 x2 = fromIntegral $ (n .&. 0x3F) + 0x80
147 ord3 :: Char -> (Word8,Word8,Word8)
148 ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3)
151 x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
152 x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
153 x3 = fromIntegral $ (n .&. 0x3F) + 0x80
155 ord4 :: Char -> (Word8,Word8,Word8,Word8)
156 ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4)
159 x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
160 x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
161 x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
162 x4 = fromIntegral $ (n .&. 0x3F) + 0x80
164 chr2 :: Word8 -> Word8 -> Char
165 chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
169 !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
173 chr3 :: Word8 -> Word8 -> Word8 -> Char
174 chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
179 !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
180 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
184 chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
185 chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
186 C# (chr# (z1# +# z2# +# z3# +# z4#))
192 !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
193 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
194 !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
198 between :: Word8 -- ^ byte to check
199 -> Word8 -- ^ lower bound
200 -> Word8 -- ^ upper bound
202 between x y z = x >= y && x <= z
203 {-# INLINE between #-}
205 validate3 :: Word8 -> Word8 -> Word8 -> Bool
206 {-# INLINE validate3 #-}
207 validate3 x1 x2 x3 = validate3_1 ||
212 validate3_1 = (x1 == 0xE0) &&
213 between x2 0xA0 0xBF &&
215 validate3_2 = between x1 0xE1 0xEC &&
216 between x2 0x80 0xBF &&
218 validate3_3 = x1 == 0xED &&
219 between x2 0x80 0x9F &&
221 validate3_4 = between x1 0xEE 0xEF &&
222 between x2 0x80 0xBF &&
225 validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
226 {-# INLINE validate4 #-}
227 validate4 x1 x2 x3 x4 = validate4_1 ||
231 validate4_1 = x1 == 0xF0 &&
232 between x2 0x90 0xBF &&
233 between x3 0x80 0xBF &&
235 validate4_2 = between x1 0xF1 0xF3 &&
236 between x2 0x80 0xBF &&
237 between x3 0x80 0xBF &&
239 validate4_3 = x1 == 0xF4 &&
240 between x2 0x80 0x8F &&
241 between x3 0x80 0xBF &&