GHC/IO/Encoding/Types.hs

   1 {-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-}
   2 {-# OPTIONS_GHC -funbox-strict-fields #-}
   3
   4 -----------------------------------------------------------------------------
   5 -- |
   6 -- Module      :  GHC.IO.Encoding.Types
   7 -- Copyright   :  (c) The University of Glasgow, 2008-2009
   8 -- License     :  see libraries/base/LICENSE
   9 --
  10 -- Maintainer  :  libraries@haskell.org
  11 -- Stability   :  internal
  12 -- Portability :  non-portable
  13 --
  14 -- Types for text encoding/decoding
  15 --
  16 -----------------------------------------------------------------------------
  17
  18 module GHC.IO.Encoding.Types (
  19     BufferCodec(..),
  20     TextEncoding(..),
  21     TextEncoder, TextDecoder,
  22     EncodeBuffer, DecodeBuffer,
  23     CodingProgress(..)
  24   ) where
  25
  26 import GHC.Base
  27 import GHC.Word
  28 import GHC.Show
  29 -- import GHC.IO
  30 import GHC.IO.Buffer
  31
  32 -- -----------------------------------------------------------------------------
  33 -- Text encoders/decoders
  34 -- | The operations of a codec translating @from@ elements into @to@ elements; its state has type @state@.
  35 data BufferCodec from to state = BufferCodec {
  36   encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to),
  37    -- ^ The @encode@ function translates elements of the buffer @from@
  38    -- to the buffer @to@.  It should translate as many elements as possible
  39    -- given the sizes of the buffers, including translating zero elements
  40    -- if there is either not enough room in @to@, or @from@ does not
  41    -- contain a complete multibyte sequence.
  42    --
  43    -- The fact that as many elements as possible are translated is used by the IO
  44    -- library in order to report translation errors at the point they
  45    -- actually occur, rather than when the buffer is translated.
  46    --
  47    -- To allow us to use iconv as a 'BufferCodec' efficiently, character buffers are
  48    -- defined to contain lone surrogates instead of those private use characters that
  49    -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
  50    -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
  51    -- respectively.
  52    --
  53    -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
  54
  55   recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
  56    -- ^ The @recover@ function is used to continue decoding
  57    -- in the presence of invalid or unrepresentable sequences. This includes
  58    -- both those detected by @encode@ returning @InvalidSequence@ and those
  59    -- that occur because the input byte sequence appears to be truncated.
  60    --
  61    -- Progress will usually be made by skipping the first element of the @from@
  62    -- buffer. This function should only be called if you are certain that you
  63    -- wish to do this skipping, and if the @to@ buffer has at least one element
  64    -- of free space.
  65    --
  66    -- @recover@ may raise an exception rather than skipping anything.
  67    --
  68    -- Currently, some implementations of @recover@ may mutate the input buffer.
  69    -- In particular, this feature is used to implement transliteration.
  70
  71   close  :: IO (),
  72    -- ^ Resources associated with the encoding may now be released.
  73    -- The @encode@ function may not be called again after calling
  74    -- @close@.
  75
  76   getState :: IO state,
  77    -- ^ Return the current state of the codec.
  78    --
  79    -- Many codecs are not stateful, and in this case the state can be
  80    -- represented as '()'.  Other codecs maintain a state.  For
  81    -- example, UTF-16 recognises a BOM (byte-order-mark) character at
  82    -- the beginning of the input, and remembers thereafter whether to
  83    -- use big-endian or little-endian mode.  In this case, the state
  84    -- of the codec would include two pieces of information: whether we
  85    -- are at the beginning of the stream (the BOM only occurs at the
  86    -- beginning), and if not, whether to use the big or little-endian
  87    -- encoding.
  88
  89   setState :: state -> IO ()
  90    -- ^ Restore the state of the codec using the state from a previous
  91    -- call to 'getState'.
  92  }
  93 -- | Type of a decoding step; matches the @encode@ field of 'BufferCodec' with @from = Word8@, @to = Char@.
  94 type DecodeBuffer = Buffer Word8 -> Buffer Char
  95                   -> IO (CodingProgress, Buffer Word8, Buffer Char)
  96 -- | Type of an encoding step; matches the @encode@ field of 'BufferCodec' with @from = Char@, @to = Word8@.
  97 type EncodeBuffer = Buffer Char -> Buffer Word8
  98                   -> IO (CodingProgress, Buffer Char, Buffer Word8)
  99 -- | A 'BufferCodec' from 'Word8' to 'CharBufElem', with codec state of type @state@.
 100 type TextDecoder state = BufferCodec Word8 CharBufElem state
 101 type TextEncoder state = BufferCodec CharBufElem Word8 state  -- the reverse: 'CharBufElem' to 'Word8'
 102
 103 -- | A 'TextEncoding' is a specification of a conversion scheme
 104 -- between sequences of bytes and sequences of Unicode characters.
 105 --
 106 -- For example, UTF-8 is an encoding of Unicode characters into a sequence
 107 -- of bytes.  The 'TextEncoding' for UTF-8 is 'utf8'.
 108 data TextEncoding
 109   = forall dstate estate . TextEncoding  {
 110         textEncodingName :: String,
 111                    -- ^ a string that can be passed to 'mkTextEncoding' to
 112                    -- create an equivalent 'TextEncoding'.
 113         mkTextDecoder :: IO (TextDecoder dstate),
 114                    -- ^ Creates a means of decoding bytes into characters: the result must not
 115                    -- be shared between several byte sequences or simultaneously across threads.
 116         mkTextEncoder :: IO (TextEncoder estate)
 117                    -- ^ Creates a means of encoding characters into bytes: the result must not
 118                    -- be shared between several character sequences or simultaneously across threads.
 119   }
 120
 121 instance Show TextEncoding where
 122   -- | A 'TextEncoding' is shown as its 'textEncodingName'.
 123   show = textEncodingName
 124 -- | The reason a translation step stopped, as returned by the @encode@ field of 'BufferCodec'.
 125 data CodingProgress = InputUnderflow  -- ^ Stopped because the input contains insufficient available elements,
 126                                       -- or all of the input sequence has been successfully translated.
 127                     | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements
 128                     | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
 129                                       -- to output at least one encoded ASCII character, but the input contains
 130                                       -- an invalid or unrepresentable sequence.
 131                     deriving (Eq, Show)