X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FIO%2FEncoding%2FUTF8.hs;h=55d09c82b1d7fa335522f5e68388ede4578b462e;hb=509f28cc93b980d30aca37008cbe66c677a0d6f6;hp=59125256f46574ae8f5470f725374d05be18658c;hpb=8afc9fecd586d3c4f7ef9c69fb1686a79e5f441d;p=ghc-base.git diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs index 5912525..55d09c8 100644 --- a/GHC/IO/Encoding/UTF8.hs +++ b/GHC/IO/Encoding/UTF8.hs @@ -1,5 +1,10 @@ -{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-} -{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE NoImplicitPrelude + , BangPatterns + , NondecreasingIndentation + , MagicHash + #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding.UTF8 @@ -19,51 +24,131 @@ ----------------------------------------------------------------------------- module GHC.IO.Encoding.UTF8 ( - utf8, - utf8_decode, - utf8_encode, + utf8, mkUTF8, + utf8_bom, mkUTF8_bom ) where import GHC.Base import GHC.Real import GHC.Num +import GHC.IORef -- import GHC.IO -import GHC.IO.Exception import GHC.IO.Buffer +import GHC.IO.Encoding.Failure import GHC.IO.Encoding.Types import GHC.Word import Data.Bits -import Data.Maybe utf8 :: TextEncoding -utf8 = TextEncoding { mkTextDecoder = utf8_DF, - mkTextEncoder = utf8_EF } +utf8 = mkUTF8 ErrorOnCodingFailure + +mkUTF8 :: CodingFailureMode -> TextEncoding +mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8", + mkTextDecoder = utf8_DF cfm, + mkTextEncoder = utf8_EF cfm } + -utf8_DF :: IO (TextDecoder ()) -utf8_DF = +utf8_DF :: CodingFailureMode -> IO (TextDecoder ()) +utf8_DF cfm = return (BufferCodec { encode = utf8_decode, + recover = recoverDecode cfm, close = return (), getState = return (), setState = const $ return () }) -utf8_EF :: IO (TextEncoder ()) -utf8_EF = +utf8_EF :: CodingFailureMode -> IO (TextEncoder ()) +utf8_EF cfm = return (BufferCodec { encode = utf8_encode, + recover = recoverEncode cfm, close = return (), getState = return (), setState = const $ return () }) +utf8_bom :: TextEncoding +utf8_bom = mkUTF8_bom ErrorOnCodingFailure + +mkUTF8_bom :: CodingFailureMode -> TextEncoding +mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM", + mkTextDecoder = utf8_bom_DF cfm, + mkTextEncoder = utf8_bom_EF cfm } + +utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool) +utf8_bom_DF cfm = do + ref <- newIORef True + return (BufferCodec { + encode = utf8_bom_decode ref, + recover = recoverDecode cfm, + close = return (), + getState = readIORef ref, + setState = writeIORef ref + }) + +utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool) +utf8_bom_EF cfm = do + ref <- newIORef True + return (BufferCodec { + encode = utf8_bom_encode ref, + recover = recoverEncode cfm, + close = return (), + getState = readIORef ref, + setState = writeIORef ref + }) + +utf8_bom_decode :: IORef Bool -> DecodeBuffer +utf8_bom_decode ref + input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } + output + = do + first <- readIORef ref + if not first + then utf8_decode input output + else do + let no_bom = do writeIORef ref False; utf8_decode input output + if iw - ir < 1 then return (InputUnderflow,input,output) else do + c0 <- readWord8Buf iraw ir + if (c0 /= bom0) then no_bom else do + if iw - ir < 2 then return (InputUnderflow,input,output) else do + c1 <- readWord8Buf iraw (ir+1) + if (c1 /= bom1) then no_bom else do + if iw - ir < 3 then return (InputUnderflow,input,output) else do + c2 <- readWord8Buf iraw (ir+2) + if (c2 /= bom2) then no_bom else do + -- found a BOM, ignore it and carry on + writeIORef ref False + utf8_decode input{ bufL = ir + 3 } output + +utf8_bom_encode :: IORef Bool -> EncodeBuffer +utf8_bom_encode ref input + output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } + = do + b <- readIORef ref + if not b then utf8_encode input output + else if os - ow < 3 + then return (OutputUnderflow,input,output) + else do + writeIORef ref False + writeWord8Buf oraw ow bom0 + writeWord8Buf oraw (ow+1) bom1 + writeWord8Buf oraw (ow+2) bom2 + utf8_encode input output{ bufR = ow+3 } + +bom0, bom1, bom2 :: Word8 +bom0 = 0xef +bom1 = 0xbb +bom2 = 0xbf + utf8_decode :: DecodeBuffer utf8_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } = let loop !ir !ow - | ow >= os || ir >= iw = done ir ow + | ow >= os = done OutputUnderflow ir ow + | ir >= iw = done InputUnderflow ir ow | otherwise = do c0 <- readWord8Buf iraw ir case c0 of @@ -71,20 +156,39 @@ utf8_decode ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) loop (ir+1) ow' | c0 >= 0xc0 && c0 <= 0xdf -> - if iw - ir < 2 then done ir ow else do + if iw - ir < 2 then done InputUnderflow ir ow else do c1 <- readWord8Buf iraw (ir+1) if (c1 < 0x80 || c1 >= 0xc0) then invalid else do ow' <- writeCharBuf oraw ow (chr2 c0 c1) loop (ir+2) ow' | c0 >= 0xe0 && c0 <= 0xef -> - if iw - ir < 3 then done ir ow else do + case iw - ir of + 1 -> done InputUnderflow ir ow + 2 -> do -- check for an error even when we don't have + -- the full sequence yet (#3341) + c1 <- readWord8Buf iraw (ir+1) + if not (validate3 c0 c1 0x80) + then invalid else done InputUnderflow ir ow + _ -> do c1 <- readWord8Buf iraw (ir+1) c2 <- readWord8Buf iraw (ir+2) if not (validate3 c0 c1 c2) then invalid else do ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2) loop (ir+3) ow' | c0 >= 0xf0 -> - if iw - ir < 4 then done ir ow else do + case iw - ir of + 1 -> done InputUnderflow ir ow + 2 -> do -- check for an error even when we don't have + -- the full sequence yet (#3341) + c1 <- readWord8Buf iraw (ir+1) + if not (validate4 c0 c1 0x80 0x80) + then invalid else done InputUnderflow ir ow + 3 -> do + c1 <- readWord8Buf iraw (ir+1) + c2 <- readWord8Buf iraw (ir+2) + if not (validate4 c0 c1 c2 0x80) + then invalid else done InputUnderflow ir ow + _ -> do c1 <- readWord8Buf iraw (ir+1) c2 <- readWord8Buf iraw (ir+2) c3 <- readWord8Buf iraw (ir+3) @@ -94,30 +198,28 @@ utf8_decode | otherwise -> invalid where - invalid = if ir > ir0 then done ir ow else ioe_decodingError + invalid = done InvalidSequence ir ow -- lambda-lifted, to avoid thunks being built in the inner-loop: - done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + done why !ir !ow = return (why, + if ir == iw then input{ bufL=0, bufR=0 } + else input{ bufL=ir }, + output{ bufR=ow }) in loop ir0 ow0 -ioe_decodingError :: IO a -ioe_decodingError = ioException - (IOError Nothing InvalidArgument "utf8_decode" - "invalid UTF-8 byte sequence" Nothing Nothing) - utf8_encode :: EncodeBuffer utf8_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } = let - done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + done why !ir !ow = return (why, + if ir == iw then input{ bufL=0, bufR=0 } + else input{ bufL=ir }, + output{ bufR=ow }) loop !ir !ow - | ow >= os || ir >= iw = done ir ow + | ow >= os = done OutputUnderflow ir ow + | ir >= iw = done InputUnderflow ir ow | otherwise = do (c,ir') <- readCharBuf iraw ir case ord c of @@ -125,20 +227,20 @@ utf8_encode writeWord8Buf oraw ow (fromIntegral x) loop ir' (ow+1) | x <= 0x07FF -> - if os - ow < 2 then done ir ow else do + if os - ow < 2 then done OutputUnderflow ir ow else do let (c1,c2) = ord2 c writeWord8Buf oraw ow c1 writeWord8Buf oraw (ow+1) c2 loop ir' (ow+2) - | x <= 0xFFFF -> do - if os - ow < 3 then done ir ow else do + | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do + if os - ow < 3 then done OutputUnderflow ir ow else do let (c1,c2,c3) = ord3 c writeWord8Buf oraw ow c1 writeWord8Buf oraw (ow+1) c2 writeWord8Buf oraw (ow+2) c3 loop ir' (ow+3) | otherwise -> do - if os - ow < 4 then done ir ow else do + if os - ow < 4 then done OutputUnderflow ir ow else do let (c1,c2,c3,c4) = ord4 c writeWord8Buf oraw ow c1 writeWord8Buf oraw (ow+1) c2