1 {-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-}
2 {-# OPTIONS_GHC -funbox-strict-fields #-}
4 -----------------------------------------------------------------------------
6 -- Module : GHC.IO.Encoding.Types
7 -- Copyright : (c) The University of Glasgow, 2008-2009
8 -- License : see libraries/base/LICENSE
10 -- Maintainer : libraries@haskell.org
11 -- Stability : internal
12 -- Portability : non-portable
14 -- Types for text encoding/decoding
16 -----------------------------------------------------------------------------
18 module GHC.IO.Encoding.Types (
21 TextEncoder, TextDecoder,
22 EncodeBuffer, DecodeBuffer,
31 -- -----------------------------------------------------------------------------
32 -- Text encoders/decoders
-- | A codec that translates elements of type @from@ into elements of
-- type @to@, working from one buffer into another. The @state@ parameter
-- is the type of the codec state accepted by @setState@.
34 data BufferCodec from to state = BufferCodec {
35 encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
36 -- ^ The @encode@ function translates elements of the buffer @from@
37 -- to the buffer @to@. It should translate as many elements as possible
38 -- given the sizes of the buffers, including translating zero elements
39 -- if there is either not enough room in @to@, or @from@ does not
40 -- contain a complete multibyte sequence.
42 -- @encode@ should raise an exception if, and only if, @from@
43 -- begins with an illegal sequence, or the first element of @from@
44 -- is not representable in the encoding of @to@. That is, if any
45 -- elements can be successfully translated before an error is
46 -- encountered, then @encode@ should translate as much as it can
47 -- and not throw an exception. This behaviour is used by the IO
48 -- library in order to report translation errors at the point they
49 -- actually occur, rather than when the buffer is translated.
52 -- ^ Resources associated with the encoding may now be released.
53 -- The @encode@ function may not be called again after calling
57 -- ^ Return the current state of the codec.
59 -- Many codecs are not stateful, and in these cases the state can be
60 -- represented as '()'. Other codecs maintain a state. For
61 -- example, UTF-16 recognises a BOM (byte-order-mark) character at
62 -- the beginning of the input, and remembers thereafter whether to
63 -- use big-endian or little-endian mode. In this case, the state
64 -- of the codec would include two pieces of information: whether we
65 -- are at the beginning of the stream (the BOM only occurs at the
66 -- beginning), and if not, whether to use the big or little-endian
69 setState :: state -> IO()
70 -- ^ Restore the state of the codec using the state from a previous
71 -- call to 'getState'.
-- | The type of a decoding step: given a buffer of 'Word8' and a buffer
-- of 'Char', produce (in 'IO') a pair of buffers of those same element
-- types. This is the shape of the @encode@ field of 'BufferCodec' at
-- @from = Word8@ and @to = Char@.
type DecodeBuffer =
  Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
-- | The type of an encoding step: given a buffer of 'Char' and a buffer
-- of 'Word8', produce (in 'IO') a pair of buffers of those same element
-- types. This is the shape of the @encode@ field of 'BufferCodec' at
-- @from = Char@ and @to = Word8@.
type EncodeBuffer =
  Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
-- | A 'BufferCodec' from 'Word8' elements to 'CharBufElem' elements,
-- carrying codec state of type @state@.
type TextDecoder state =
  BufferCodec Word8 CharBufElem state

-- | A 'BufferCodec' from 'CharBufElem' elements to 'Word8' elements,
-- carrying codec state of type @state@.
type TextEncoder state =
  BufferCodec CharBufElem Word8 state
83 -- | A 'TextEncoding' is a specification of a conversion scheme
84 -- between sequences of bytes and sequences of Unicode characters.
86 -- For example, UTF-8 is an encoding of Unicode characters into a sequence
87 -- of bytes. The 'TextEncoding' for UTF-8 is 'utf8'.
--
-- The decoder and encoder state types (@dstate@ and @estate@) are
-- existentially quantified, so they do not appear in the type of a
-- 'TextEncoding'.
89 = forall dstate estate . TextEncoding {
90 textEncodingName :: String,
91 -- ^ A string that can be passed to 'mkTextEncoding' to
92 -- create an equivalent 'TextEncoding'.
93 mkTextDecoder :: IO (TextDecoder dstate),
94 -- ^ Creates a means of decoding bytes into characters: the result must not
95 -- be shared between several byte sequences or simultaneously across threads.
96 mkTextEncoder :: IO (TextEncoder estate)
97 -- ^ Creates a means of encoding characters into bytes: the result must not
98 -- be shared between several character sequences or simultaneously across threads.
instance Show TextEncoding where
  -- | An encoding is shown as its name, i.e. the 'textEncodingName' field.
  show = textEncodingName