Use Unicode private-use characters for roundtripping

[ghc-base.git] / GHC / IO / Encoding / Types.hs
diff --git a/GHC/IO/Encoding/Types.hs b/GHC/IO/Encoding/Types.hs

index b857bdf..ebce578 100644 (file)
--- a/GHC/IO/Encoding/Types.hs
+++ b/GHC/IO/Encoding/Types.hs
@@ -1,4 +1,6 @@
-{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
+{-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-}
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
  -----------------------------------------------------------------------------
  -- |
  -- Module      :  GHC.IO.Encoding.Types
@@ -18,55 +20,112 @@ module GHC.IO.Encoding.Types (
      TextEncoding(..),
      TextEncoder, TextDecoder,
      EncodeBuffer, DecodeBuffer,
+    CodingProgress(..)
    ) where
  
  import GHC.Base
  import GHC.Word
-import GHC.IO
+import GHC.Show
+-- import GHC.IO
  import GHC.IO.Buffer
  
  -- -----------------------------------------------------------------------------
  -- Text encoders/decoders
  
-data BufferCodec from to = BufferCodec {
-  encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+data BufferCodec from to state = BufferCodec {
+  encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to),
     -- ^ The @encode@ function translates elements of the buffer @from@
     -- to the buffer @to@.  It should translate as many elements as possible
     -- given the sizes of the buffers, including translating zero elements
     -- if there is either not enough room in @to@, or @from@ does not
     -- contain a complete multibyte sequence.
-   -- 
-   -- @encode@ should raise an exception if, and only if, @from@
-   -- begins with an illegal sequence, or the first element of @from@
-   -- is not representable in the encoding of @to@.  That is, if any
-   -- elements can be successfully translated before an error is
-   -- encountered, then @encode@ should translate as much as it can
-   -- and not throw an exception.  This behaviour is used by the IO
+   --
+   -- The fact that as many elements as possible are translated is used by the IO
     -- library in order to report translation errors at the point they
     -- actually occur, rather than when the buffer is translated.
     --
-  close  :: IO ()
+   -- To allow us to use iconv as a BufferCodec efficiently, character buffers are
+   -- defined to contain lone surrogates instead of those private use characters that
+   -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
+   -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
+   -- respectively.
+   --
+   -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
+  
+  recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+   -- ^ The @recover@ function is used to continue decoding
+   -- in the presence of invalid or unrepresentable sequences. This includes
+   -- both those detected by @encode@ returning @InvalidSequence@ and those
+   -- that occur because the input byte sequence appears to be truncated.
+   --
+   -- Progress will usually be made by skipping the first element of the @from@
+   -- buffer. This function should only be called if you are certain that you
+   -- wish to do this skipping, and if the @to@ buffer has at least one element
+   -- of free space.
+   --
+   -- @recover@ may raise an exception rather than skipping anything.
+   --
+   -- Currently, some implementations of @recover@ may mutate the input buffer.
+   -- In particular, this feature is used to implement transliteration.
+  
+  close  :: IO (),
     -- ^ Resources associated with the encoding may now be released.
     -- The @encode@ function may not be called again after calling
     -- @close@.
+
+  getState :: IO state,
+   -- ^ Return the current state of the codec.
+   --
+   -- Many codecs are not stateful, and in these cases the state can be
+   -- represented as '()'.  Other codecs maintain a state.  For
+   -- example, UTF-16 recognises a BOM (byte-order-mark) character at
+   -- the beginning of the input, and remembers thereafter whether to
+   -- use big-endian or little-endian mode.  In this case, the state
+   -- of the codec would include two pieces of information: whether we
+   -- are at the beginning of the stream (the BOM only occurs at the
+   -- beginning), and if not, whether to use the big or little-endian
+   -- encoding.
+
+  setState :: state -> IO ()
+   -- ^ Restore the state of the codec using the state from a previous
+   -- call to 'getState'.
   }
  
  type DecodeBuffer = Buffer Word8 -> Buffer Char
-                  -> IO (Buffer Word8, Buffer Char)
+                  -> IO (CodingProgress, Buffer Word8, Buffer Char) -- stop reason plus the resulting byte and character buffers
  
  type EncodeBuffer = Buffer Char -> Buffer Word8
-                  -> IO (Buffer Char, Buffer Word8)
+                  -> IO (CodingProgress, Buffer Char, Buffer Word8) -- stop reason plus the resulting character and byte buffers
  
-type TextDecoder = BufferCodec Word8 CharBufElem
-type TextEncoder = BufferCodec CharBufElem Word8
+type TextDecoder state = BufferCodec Word8 CharBufElem state -- codec from bytes to character-buffer elements
+type TextEncoder state = BufferCodec CharBufElem Word8 state -- codec from character-buffer elements to bytes
  
  -- | A 'TextEncoding' is a specification of a conversion scheme
  -- between sequences of bytes and sequences of Unicode characters.
  --
  -- For example, UTF-8 is an encoding of Unicode characters into a sequence
--- of bytes.  The 'TextEncoding' for UTF-8 is 'utf_8'.
+-- of bytes.  The 'TextEncoding' for UTF-8 is 'utf8'.
  data TextEncoding
-  = TextEncoding  {
-       mkTextDecoder :: IO TextDecoder,
-       mkTextEncoder :: IO TextEncoder
+  = forall dstate estate . TextEncoding  {
+        textEncodingName :: String,
+                   -- ^ a string that can be passed to 'mkTextEncoding' to
+                   -- create an equivalent 'TextEncoding'.
+        mkTextDecoder :: IO (TextDecoder dstate),
+                   -- ^ Creates a means of decoding bytes into characters: the result must not
+                   -- be shared between several byte sequences or simultaneously across threads
+        mkTextEncoder :: IO (TextEncoder estate)
+                   -- ^ Creates a means of encoding characters into bytes: the result must not
+                   -- be shared between several character sequences or simultaneously across threads
    }
+
+instance Show TextEncoding where
+  -- | Renders a 'TextEncoding' as its 'textEncodingName'.
+  show te = textEncodingName te
+
+data CodingProgress = InputUnderflow  -- ^ Stopped because the input contains insufficient available elements,
+                                      -- or all of the input sequence has been successfully translated.
+                    | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements
+                    | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
+                                      -- to output at least one encoded ASCII character, but the input contains
+                                      -- an invalid or unrepresentable sequence
+                    deriving (Eq, Show)