-{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE NoImplicitPrelude
+ , BangPatterns
+ , NondecreasingIndentation
+ , MagicHash
+ #-}
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
-----------------------------------------------------------------------------
-- |
-- Module : GHC.IO.Encoding.UTF8
module GHC.IO.Encoding.UTF8 (
utf8,
- utf8_decode,
- utf8_encode,
+ utf8_bom,
) where
import GHC.Base
import GHC.Real
import GHC.Num
-import GHC.IO
+import GHC.IORef
+-- import GHC.IO
import GHC.IO.Exception
import GHC.IO.Buffer
import GHC.IO.Encoding.Types
import Data.Maybe
utf8 :: TextEncoding
-utf8 = TextEncoding { mkTextDecoder = utf8_DF,
+utf8 = TextEncoding { textEncodingName = "UTF-8",
+ mkTextDecoder = utf8_DF,
mkTextEncoder = utf8_EF }
utf8_DF :: IO (TextDecoder ())
setState = const $ return ()
})
+utf8_bom :: TextEncoding
+utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM",
+ mkTextDecoder = utf8_bom_DF,
+ mkTextEncoder = utf8_bom_EF }
+
+utf8_bom_DF :: IO (TextDecoder Bool)
+utf8_bom_DF = do
+ ref <- newIORef True
+ return (BufferCodec {
+ encode = utf8_bom_decode ref,
+ close = return (),
+ getState = readIORef ref,
+ setState = writeIORef ref
+ })
+
+utf8_bom_EF :: IO (TextEncoder Bool)
+utf8_bom_EF = do
+ ref <- newIORef True
+ return (BufferCodec {
+ encode = utf8_bom_encode ref,
+ close = return (),
+ getState = readIORef ref,
+ setState = writeIORef ref
+ })
+
+utf8_bom_decode :: IORef Bool -> DecodeBuffer
+utf8_bom_decode ref
+ input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
+ output
+ = do
+ first <- readIORef ref
+ if not first
+ then utf8_decode input output
+ else do
+ let no_bom = do writeIORef ref False; utf8_decode input output
+ if iw - ir < 1 then return (input,output) else do
+ c0 <- readWord8Buf iraw ir
+ if (c0 /= bom0) then no_bom else do
+ if iw - ir < 2 then return (input,output) else do
+ c1 <- readWord8Buf iraw (ir+1)
+ if (c1 /= bom1) then no_bom else do
+ if iw - ir < 3 then return (input,output) else do
+ c2 <- readWord8Buf iraw (ir+2)
+ if (c2 /= bom2) then no_bom else do
+ -- found a BOM, ignore it and carry on
+ writeIORef ref False
+ utf8_decode input{ bufL = ir + 3 } output
+
+utf8_bom_encode :: IORef Bool -> EncodeBuffer
+utf8_bom_encode ref input
+ output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
+ = do
+ b <- readIORef ref
+ if not b then utf8_encode input output
+ else if os - ow < 3
+ then return (input,output)
+ else do
+ writeIORef ref False
+ writeWord8Buf oraw ow bom0
+ writeWord8Buf oraw (ow+1) bom1
+ writeWord8Buf oraw (ow+2) bom2
+ utf8_encode input output{ bufR = ow+3 }
+
+bom0, bom1, bom2 :: Word8
+bom0 = 0xef
+bom1 = 0xbb
+bom2 = 0xbf
+
utf8_decode :: DecodeBuffer
utf8_decode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
c0 <- readWord8Buf iraw ir
case c0 of
_ | c0 <= 0x7f -> do
- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
- loop (ir+1) (ow+1)
+ ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
+ loop (ir+1) ow'
| c0 >= 0xc0 && c0 <= 0xdf ->
if iw - ir < 2 then done ir ow else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
- writeCharBuf oraw ow (chr2 c0 c1)
- loop (ir+2) (ow+1)
+ ow' <- writeCharBuf oraw ow (chr2 c0 c1)
+ loop (ir+2) ow'
| c0 >= 0xe0 && c0 <= 0xef ->
- if iw - ir < 3 then done ir ow else do
+ case iw - ir of
+ 1 -> done ir ow
+ 2 -> do -- check for an error even when we don't have
+ -- the full sequence yet (#3341)
+ c1 <- readWord8Buf iraw (ir+1)
+ if not (validate3 c0 c1 0x80)
+ then invalid else done ir ow
+ _ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
if not (validate3 c0 c1 c2) then invalid else do
- writeCharBuf oraw ow (chr3 c0 c1 c2)
- loop (ir+3) (ow+1)
- | otherwise ->
- if iw - ir < 4 then done ir ow else do
+ ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2)
+ loop (ir+3) ow'
+ | c0 >= 0xf0 ->
+ case iw - ir of
+ 1 -> done ir ow
+ 2 -> do -- check for an error even when we don't have
+ -- the full sequence yet (#3341)
+ c1 <- readWord8Buf iraw (ir+1)
+ if not (validate4 c0 c1 0x80 0x80)
+ then invalid else done ir ow
+ 3 -> do
+ c1 <- readWord8Buf iraw (ir+1)
+ c2 <- readWord8Buf iraw (ir+2)
+ if not (validate4 c0 c1 c2 0x80)
+ then invalid else done ir ow
+ _ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
if not (validate4 c0 c1 c2 c3) then invalid else do
- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
- loop (ir+4) (ow+1)
+ ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
+ loop (ir+4) ow'
+ | otherwise ->
+ invalid
where
invalid = if ir > ir0 then done ir ow else ioe_decodingError