X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FIO%2FEncoding%2FUTF8.hs;h=dea4fdea9ed3d697380a3d5087865db6f57cc36e;hb=41e8fba828acbae1751628af50849f5352b27873;hp=edca77c43b922eecf96d3eb83634816d7393e0de;hpb=207e64ee80ef749dbb46df0fa6a134b19d5a42a5;p=ghc-base.git diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs index edca77c..dea4fde 100644 --- a/GHC/IO/Encoding/UTF8.hs +++ b/GHC/IO/Encoding/UTF8.hs @@ -1,5 +1,10 @@ -{-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-} -{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE NoImplicitPrelude + , BangPatterns + , NondecreasingIndentation + , MagicHash + #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding.UTF8 @@ -36,7 +41,8 @@ import Data.Bits import Data.Maybe utf8 :: TextEncoding -utf8 = TextEncoding { mkTextDecoder = utf8_DF, +utf8 = TextEncoding { textEncodingName = "UTF-8", + mkTextDecoder = utf8_DF, mkTextEncoder = utf8_EF } utf8_DF :: IO (TextDecoder ()) @@ -58,7 +64,8 @@ utf8_EF = }) utf8_bom :: TextEncoding -utf8_bom = TextEncoding { mkTextDecoder = utf8_bom_DF, +utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM", + mkTextDecoder = utf8_bom_DF, mkTextEncoder = utf8_bom_EF } utf8_bom_DF :: IO (TextDecoder Bool) @@ -144,14 +151,33 @@ utf8_decode ow' <- writeCharBuf oraw ow (chr2 c0 c1) loop (ir+2) ow' | c0 >= 0xe0 && c0 <= 0xef -> - if iw - ir < 3 then done ir ow else do + case iw - ir of + 1 -> done ir ow + 2 -> do -- check for an error even when we don't have + -- the full sequence yet (#3341) + c1 <- readWord8Buf iraw (ir+1) + if not (validate3 c0 c1 0x80) + then invalid else done ir ow + _ -> do c1 <- readWord8Buf iraw (ir+1) c2 <- readWord8Buf iraw (ir+2) if not (validate3 c0 c1 c2) then invalid else do ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2) loop (ir+3) ow' | c0 >= 0xf0 -> - if iw - ir < 4 then done ir ow else do + case iw - ir of + 1 -> done ir ow + 2 -> do -- check for an error even when we don't have + -- the full sequence yet (#3341) + c1 <- readWord8Buf iraw (ir+1) + if not (validate4 c0 c1 0x80 0x80) + then invalid else done ir ow + 3 -> do + c1 <- readWord8Buf iraw (ir+1) + c2 <- readWord8Buf iraw (ir+2) + if not (validate4 c0 c1 c2 0x80) + then invalid else done ir ow + _ -> do c1 <- readWord8Buf iraw (ir+1) c2 <- readWord8Buf iraw (ir+2) c3 <- readWord8Buf iraw (ir+3)