X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FIO%2FEncoding%2FTypes.hs;h=ebce578219061f71f64f9c23f869ef327ec82d86;hb=4c889c7daa98daff7aec5c0e4ccf491f25f5d10c;hp=9be6c98b1324a0a55c737804a5e171bc6b491589;hpb=207e64ee80ef749dbb46df0fa6a134b19d5a42a5;p=ghc-base.git diff --git a/GHC/IO/Encoding/Types.hs b/GHC/IO/Encoding/Types.hs index 9be6c98..ebce578 100644 --- a/GHC/IO/Encoding/Types.hs +++ b/GHC/IO/Encoding/Types.hs @@ -1,4 +1,6 @@ -{-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-} +{-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding.Types @@ -18,10 +20,12 @@ module GHC.IO.Encoding.Types ( TextEncoding(..), TextEncoder, TextDecoder, EncodeBuffer, DecodeBuffer, + CodingProgress(..) ) where import GHC.Base import GHC.Word +import GHC.Show -- import GHC.IO import GHC.IO.Buffer @@ -29,22 +33,41 @@ import GHC.IO.Buffer -- Text encoders/decoders data BufferCodec from to state = BufferCodec { - encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to), + encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to), -- ^ The @encode@ function translates elements of the buffer @from@ -- to the buffer @to@. It should translate as many elements as possible -- given the sizes of the buffers, including translating zero elements -- if there is either not enough room in @to@, or @from@ does not -- contain a complete multibyte sequence. - -- - -- @encode@ should raise an exception if, and only if, @from@ - -- begins with an illegal sequence, or the first element of @from@ - -- is not representable in the encoding of @to@. That is, if any - -- elements can be successfully translated before an error is - -- encountered, then @encode@ should translate as much as it can - -- and not throw an exception. 
This behaviour is used by the IO + -- + -- The fact that as many elements as possible are translated is used by the IO -- library in order to report translation errors at the point they -- actually occur, rather than when the buffer is translated. -- + -- To allow us to use iconv as a BufferCodec efficiently, character buffers are + -- defined to contain lone surrogates instead of those private use characters that + -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer + -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter + -- respectively. + -- + -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure. + + recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to), + -- ^ The @recover@ function is used to continue decoding + -- in the presence of invalid or unrepresentable sequences. This includes + -- both those detected by @encode@ returning @InvalidSequence@ and those + -- that occur because the input byte sequence appears to be truncated. + -- + -- Progress will usually be made by skipping the first element of the @from@ + -- buffer. This function should only be called if you are certain that you + -- wish to do this skipping, and if the @to@ buffer has at least one element + -- of free space. + -- + -- @recover@ may raise an exception rather than skipping anything. + -- + -- Currently, some implementations of @recover@ may mutate the input buffer. + -- In particular, this feature is used to implement transliteration. + close :: IO (), -- ^ Resources associated with the encoding may now be released. -- The @encode@ function may not be called again after calling @@ -63,16 +86,16 @@ data BufferCodec from to state = BufferCodec { -- beginning), and if not, whether to use the big or little-endian -- encoding. - setState :: state -> IO() + setState :: state -> IO () -- restore the state of the codec using the state from a previous -- call to 'getState'. 
} type DecodeBuffer = Buffer Word8 -> Buffer Char - -> IO (Buffer Word8, Buffer Char) + -> IO (CodingProgress, Buffer Word8, Buffer Char) type EncodeBuffer = Buffer Char -> Buffer Word8 - -> IO (Buffer Char, Buffer Word8) + -> IO (CodingProgress, Buffer Char, Buffer Word8) type TextDecoder state = BufferCodec Word8 CharBufElem state type TextEncoder state = BufferCodec CharBufElem Word8 state @@ -84,6 +107,25 @@ type TextEncoder state = BufferCodec CharBufElem Word8 state -- of bytes. The 'TextEncoding' for UTF-8 is 'utf8'. data TextEncoding = forall dstate estate . TextEncoding { - mkTextDecoder :: IO (TextDecoder dstate), - mkTextEncoder :: IO (TextEncoder estate) + textEncodingName :: String, + -- ^ a string that can be passed to 'mkTextEncoding' to + -- create an equivalent 'TextEncoding'. + mkTextDecoder :: IO (TextDecoder dstate), + -- ^ Creates a means of decoding bytes into characters: the result must not + -- be shared between several byte sequences or simultaneously across threads + mkTextEncoder :: IO (TextEncoder estate) + -- ^ Creates a means of encoding characters into bytes: the result must not + -- be shared between several character sequences or simultaneously across threads } + +instance Show TextEncoding where + -- | Returns the value of 'textEncodingName' + show te = textEncodingName te + +data CodingProgress = InputUnderflow -- ^ Stopped because the input contains insufficient available elements, + -- or all of the input sequence has been successfully translated. + | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements + | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output + -- to output at least one encoded ASCII character, but the input contains + -- an invalid or unrepresentable sequence + deriving (Eq, Show)