GHC/IO/Encoding/Types.hs

   1 {-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
   2 -----------------------------------------------------------------------------
   3 -- |
   4 -- Module      :  GHC.IO.Encoding.Types
   5 -- Copyright   :  (c) The University of Glasgow, 2008-2009
   6 -- License     :  see libraries/base/LICENSE
   7 --
   8 -- Maintainer  :  libraries@haskell.org
   9 -- Stability   :  internal
  10 -- Portability :  non-portable
  11 --
  12 -- Types for text encoding/decoding
  13 --
  14 -----------------------------------------------------------------------------
  15
  16 module GHC.IO.Encoding.Types (
  17     BufferCodec(..),
  18     TextEncoding(..),
  19     TextEncoder, TextDecoder,
  20     EncodeBuffer, DecodeBuffer,
  21   ) where
  22
  23 import GHC.Base
  24 import GHC.Word
  25 -- import GHC.IO
  26 import GHC.IO.Buffer
  27
  28 -- -----------------------------------------------------------------------------
  29 -- Text encoders/decoders
  30 -- | A record of 'IO' actions that together implement a conversion from buffers of @from@ to buffers of @to@; @state@ is the type returned by 'getState' and accepted by 'setState'.
  31 data BufferCodec from to state = BufferCodec {
  32   encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
  33    -- ^ The @encode@ function translates elements of the buffer @from@
  34    -- to the buffer @to@.  It should translate as many elements as possible
  35    -- given the sizes of the buffers, including translating zero elements
  36    -- if there is either not enough room in @to@, or @from@ does not
  37    -- contain a complete multibyte sequence.
  38    --
  39    -- @encode@ should raise an exception if, and only if, @from@
  40    -- begins with an illegal sequence, or the first element of @from@
  41    -- is not representable in the encoding of @to@.  That is, if any
  42    -- elements can be successfully translated before an error is
  43    -- encountered, then @encode@ should translate as much as it can
  44    -- and not throw an exception.  This behaviour is used by the IO
  45    -- library in order to report translation errors at the point they
  46    -- actually occur, rather than when the buffer is translated.
  47    --
  48   close  :: IO (),
  49    -- ^ Resources associated with the encoding may now be released.
  50    -- The @encode@ function must not be called again after calling
  51    -- @close@.
  52
  53   getState :: IO state,
  54    -- ^ Return the current state of the codec.
  55    --
  56    -- Many codecs are not stateful, and in these cases the state can be
  57    -- represented as '()'.  Other codecs maintain a state.  For
  58    -- example, UTF-16 recognises a BOM (byte-order-mark) character at
  59    -- the beginning of the input, and remembers thereafter whether to
  60    -- use big-endian or little-endian mode.  In this case, the state
  61    -- of the codec would include two pieces of information: whether we
  62    -- are at the beginning of the stream (the BOM only occurs at the
  63    -- beginning), and if not, whether to use the big or little-endian
  64    -- encoding.
  65
  66   setState :: state -> IO()
  67    -- ^ Restore the state of the codec using the state from a previous
  68    -- call to 'getState'.
  69  }
  70 -- | The shape of the 'encode' field of 'BufferCodec' specialised to @from = 'Word8'@ and @to = 'Char'@: both buffers go in, and both come back in 'IO'.
  71 type DecodeBuffer = Buffer Word8 -> Buffer Char
  72                   -> IO (Buffer Word8, Buffer Char)
  73 -- | The shape of the 'encode' field of 'BufferCodec' specialised to @from = 'Char'@ and @to = 'Word8'@: both buffers go in, and both come back in 'IO'.
  74 type EncodeBuffer = Buffer Char -> Buffer Word8
  75                   -> IO (Buffer Char, Buffer Word8)
  76 -- | A 'BufferCodec' from 'Word8' to 'CharBufElem' whose codec state has type @state@.
  77 type TextDecoder state = BufferCodec Word8 CharBufElem state  -- NOTE(review): 'DecodeBuffer' uses 'Char' where this uses 'CharBufElem' (not defined in this module) -- confirm the two coincide
  78 type TextEncoder state = BufferCodec CharBufElem Word8 state  -- ^ A 'BufferCodec' from 'CharBufElem' to 'Word8' whose codec state has type @state@.
  79
  80 -- | A 'TextEncoding' is a specification of a conversion scheme
  81 -- between sequences of bytes and sequences of Unicode characters.
  82 --
  83 -- For example, UTF-8 is an encoding of Unicode characters into a sequence
  84 -- of bytes.  The 'TextEncoding' for UTF-8 is 'utf8'.
  85 data TextEncoding
  86   = forall dstate estate . TextEncoding  {  -- both state types are existentially quantified, so they do not appear in the type 'TextEncoding'
  87         mkTextDecoder :: IO (TextDecoder dstate),  -- ^ An 'IO' action that returns a 'TextDecoder' for this encoding.
  88         mkTextEncoder :: IO (TextEncoder estate)  -- ^ An 'IO' action that returns a 'TextEncoder' for this encoding.
  89   }