X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FIO%2FEncoding%2FUTF8.hs;h=dea4fdea9ed3d697380a3d5087865db6f57cc36e;hb=41e8fba828acbae1751628af50849f5352b27873;hp=0efb187aba9311c027950d94ae4ee43482d8a229;hpb=ccc931d0905f6e0d55cb90b045881d4515112411;p=ghc-base.git diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs index 0efb187..dea4fde 100644 --- a/GHC/IO/Encoding/UTF8.hs +++ b/GHC/IO/Encoding/UTF8.hs @@ -1,5 +1,10 @@ -{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-} -{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE NoImplicitPrelude + , BangPatterns + , NondecreasingIndentation + , MagicHash + #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding.UTF8 @@ -20,14 +25,14 @@ module GHC.IO.Encoding.UTF8 ( utf8, - utf8_decode, - utf8_encode, + utf8_bom, ) where import GHC.Base import GHC.Real import GHC.Num -import GHC.IO +import GHC.IORef +-- import GHC.IO import GHC.IO.Exception import GHC.IO.Buffer import GHC.IO.Encoding.Types @@ -36,7 +41,8 @@ import Data.Bits import Data.Maybe utf8 :: TextEncoding -utf8 = TextEncoding { mkTextDecoder = utf8_DF, +utf8 = TextEncoding { textEncodingName = "UTF-8", + mkTextDecoder = utf8_DF, mkTextEncoder = utf8_EF } utf8_DF :: IO (TextDecoder ()) @@ -57,6 +63,74 @@ utf8_EF = setState = const $ return () }) +utf8_bom :: TextEncoding +utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM", + mkTextDecoder = utf8_bom_DF, + mkTextEncoder = utf8_bom_EF } + +utf8_bom_DF :: IO (TextDecoder Bool) +utf8_bom_DF = do + ref <- newIORef True + return (BufferCodec { + encode = utf8_bom_decode ref, + close = return (), + getState = readIORef ref, + setState = writeIORef ref + }) + +utf8_bom_EF :: IO (TextEncoder Bool) +utf8_bom_EF = do + ref <- newIORef True + return (BufferCodec { + encode = utf8_bom_encode ref, + close = return (), + getState = readIORef ref, + setState = writeIORef ref + }) + +utf8_bom_decode :: IORef Bool -> DecodeBuffer +utf8_bom_decode ref + input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } + output + = do + first <- readIORef ref + if not first + then utf8_decode input output + else do + let no_bom = do writeIORef ref False; utf8_decode input output + if iw - ir < 1 then return (input,output) else do + c0 <- readWord8Buf iraw ir + if (c0 /= bom0) then no_bom else do + if iw - ir < 2 then return (input,output) else do + c1 <- readWord8Buf iraw (ir+1) + if (c1 /= bom1) then no_bom else do + if iw - ir < 3 then return (input,output) else do + c2 <- readWord8Buf iraw (ir+2) + if (c2 /= bom2) then no_bom else do + -- found a BOM, ignore it and carry on + writeIORef ref False + utf8_decode input{ bufL = ir + 3 } output + +utf8_bom_encode :: IORef Bool -> EncodeBuffer +utf8_bom_encode ref input + output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } + = do + b <- readIORef ref + if not b then utf8_encode input output + else if os - ow < 3 + then return (input,output) + else do + writeIORef ref False + writeWord8Buf oraw ow bom0 + writeWord8Buf oraw (ow+1) bom1 + writeWord8Buf oraw (ow+2) bom2 + utf8_encode input output{ bufR = ow+3 } + +bom0, bom1, bom2 :: Word8 +bom0 = 0xef +bom1 = 0xbb +bom2 = 0xbf + utf8_decode :: DecodeBuffer utf8_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } @@ -68,29 +142,50 @@ utf8_decode c0 <- readWord8Buf iraw ir case c0 of _ | c0 <= 0x7f -> do - writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) - loop (ir+1) (ow+1) + ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) + loop (ir+1) ow' | c0 >= 0xc0 && c0 <= 0xdf -> if iw - ir < 2 then done ir ow else do c1 <- readWord8Buf iraw (ir+1) if (c1 < 0x80 || c1 >= 0xc0) then invalid else do - writeCharBuf oraw ow (chr2 c0 c1) - loop (ir+2) (ow+1) + ow' <- writeCharBuf oraw ow (chr2 c0 c1) + loop (ir+2) ow' | c0 >= 0xe0 && c0 <= 0xef -> - if iw - ir < 3 then done ir ow else do + case iw - ir of + 1 -> done ir ow + 2 -> do -- check for an error even when we don't have + -- the full sequence yet (#3341) + c1 <- readWord8Buf iraw (ir+1) + if not (validate3 c0 c1 0x80) + then invalid else done ir ow + _ -> do c1 <- readWord8Buf iraw (ir+1) c2 <- readWord8Buf iraw (ir+2) if not (validate3 c0 c1 c2) then invalid else do - writeCharBuf oraw ow (chr3 c0 c1 c2) - loop (ir+3) (ow+1) - | otherwise -> - if iw - ir < 4 then done ir ow else do + ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2) + loop (ir+3) ow' + | c0 >= 0xf0 -> + case iw - ir of + 1 -> done ir ow + 2 -> do -- check for an error even when we don't have + -- the full sequence yet (#3341) + c1 <- readWord8Buf iraw (ir+1) + if not (validate4 c0 c1 0x80 0x80) + then invalid else done ir ow + 3 -> do + c1 <- readWord8Buf iraw (ir+1) + c2 <- readWord8Buf iraw (ir+2) + if not (validate4 c0 c1 c2 0x80) + then invalid else done ir ow + _ -> do c1 <- readWord8Buf iraw (ir+1) c2 <- readWord8Buf iraw (ir+2) c3 <- readWord8Buf iraw (ir+3) if not (validate4 c0 c1 c2 c3) then invalid else do - writeCharBuf oraw ow (chr4 c0 c1 c2 c3) - loop (ir+4) (ow+1) + ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3) + loop (ir+4) ow' + | otherwise -> + invalid where invalid = if ir > ir0 then done ir ow else ioe_decodingError