1 {-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-}
2 {-# OPTIONS_GHC -funbox-strict-fields #-}
4 -----------------------------------------------------------------------------
6 -- Module : GHC.IO.Encoding.Types
7 -- Copyright : (c) The University of Glasgow, 2008-2009
8 -- License : see libraries/base/LICENSE
10 -- Maintainer : libraries@haskell.org
11 -- Stability : internal
12 -- Portability : non-portable
14 -- Types for text encoding/decoding
16 -----------------------------------------------------------------------------
18 module GHC.IO.Encoding.Types (
21 TextEncoder, TextDecoder,
22 EncodeBuffer, DecodeBuffer,
32 -- -----------------------------------------------------------------------------
33 -- Text encoders/decoders
-- | A codec that translates elements of type @from@ into elements of type
-- @to@ by way of 'Buffer's. The @state@ parameter is the type of the codec
-- state accepted by @setState@.
35 data BufferCodec from to state = BufferCodec {
36 encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to),
37 -- ^ The @encode@ function translates elements of the buffer @from@
38 -- to the buffer @to@. It should translate as many elements as possible
39 -- given the sizes of the buffers, including translating zero elements
40 -- if there is either not enough room in @to@, or @from@ does not
41 -- contain a complete multibyte sequence.
43 -- The fact that as many elements as possible are translated is used by the IO
44 -- library in order to report translation errors at the point they
45 -- actually occur, rather than when the buffer is translated.
47 -- To allow us to use iconv as a BufferCodec efficiently, character buffers are
48 -- defined to contain lone surrogates instead of those private use characters that
49 -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
50 -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
53 -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
55 recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
56 -- ^ The @recover@ function is used to continue decoding
57 -- in the presence of invalid or unrepresentable sequences. This includes
58 -- both those detected by @encode@ returning @InvalidSequence@ and those
59 -- that occur because the input byte sequence appears to be truncated.
61 -- Progress will usually be made by skipping the first element of the @from@
62 -- buffer. This function should only be called if you are certain that you
63 -- wish to do this skipping, and if the @to@ buffer has at least one element
66 -- @recover@ may raise an exception rather than skipping anything.
68 -- Currently, some implementations of @recover@ may mutate the input buffer.
69 -- In particular, this feature is used to implement transliteration.
72 -- ^ Resources associated with the encoding may now be released.
73 -- The @encode@ function may not be called again after calling
77 -- ^ Return the current state of the codec.
79 -- Many codecs are not stateful, and in this case the state can be
80 -- represented as '()'. Other codecs maintain a state. For
81 -- example, UTF-16 recognises a BOM (byte-order-mark) character at
82 -- the beginning of the input, and remembers thereafter whether to
83 -- use big-endian or little-endian mode. In this case, the state
84 -- of the codec would include two pieces of information: whether we
85 -- are at the beginning of the stream (the BOM only occurs at the
86 -- beginning), and if not, whether to use the big or little-endian
89 setState :: state -> IO ()
90 -- ^ Restore the state of the codec using the state from a previous
91 -- call to 'getState'.
-- | The type of a function that takes a 'Buffer' of 'Word8' and a 'Buffer'
-- of 'Char' and returns, in 'IO', a 'CodingProgress' together with a buffer
-- of each kind. This is the shape of the @encode@ field of 'BufferCodec'
-- with @from@ as 'Word8' and @to@ as 'Char'.
94 type DecodeBuffer = Buffer Word8 -> Buffer Char
95 -> IO (CodingProgress, Buffer Word8, Buffer Char)
-- | The type of a function that takes a 'Buffer' of 'Char' and a 'Buffer'
-- of 'Word8' and returns, in 'IO', a 'CodingProgress' together with a buffer
-- of each kind. This is the shape of the @encode@ field of 'BufferCodec'
-- with @from@ as 'Char' and @to@ as 'Word8'.
97 type EncodeBuffer = Buffer Char -> Buffer Word8
98 -> IO (CodingProgress, Buffer Char, Buffer Word8)
-- | A 'BufferCodec' whose source elements are 'Word8' and whose target
-- elements are 'CharBufElem', parameterised over the codec @state@.
100 type TextDecoder state = BufferCodec Word8 CharBufElem state
-- | A 'BufferCodec' whose source elements are 'CharBufElem' and whose target
-- elements are 'Word8', parameterised over the codec @state@.
101 type TextEncoder state = BufferCodec CharBufElem Word8 state
103 -- | A 'TextEncoding' is a specification of a conversion scheme
104 -- between sequences of bytes and sequences of Unicode characters.
106 -- For example, UTF-8 is an encoding of Unicode characters into a sequence
107 -- of bytes. The 'TextEncoding' for UTF-8 is 'utf8'.
--
-- The decoder and encoder state types (@dstate@, @estate@) are existentially
-- quantified, so they do not appear in the type of a 'TextEncoding' value.
109 = forall dstate estate . TextEncoding {
110 textEncodingName :: String,
111 -- ^ A string that can be passed to 'mkTextEncoding' to
112 -- create an equivalent 'TextEncoding'.
113 mkTextDecoder :: IO (TextDecoder dstate),
114 -- ^ Creates a means of decoding bytes into characters: the result must not
115 -- be shared between several byte sequences or simultaneously across threads.
116 mkTextEncoder :: IO (TextEncoder estate)
117 -- ^ Creates a means of encoding characters into bytes: the result must not
118 -- be shared between several character sequences or simultaneously across threads.
-- A 'TextEncoding' is shown as its name alone; the 'mkTextDecoder' and
-- 'mkTextEncoder' fields do not contribute to the output.
121 instance Show TextEncoding where
122 -- | Returns the value of 'textEncodingName'.
123 show te = textEncodingName te
-- | The reason a translation step stopped, as returned alongside the buffers
-- by the @encode@ field of 'BufferCodec'.
125 data CodingProgress = InputUnderflow -- ^ Stopped because the input contains insufficient available elements,
126 -- or all of the input sequence has been successfully translated.
127 | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements.
128 | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
129 -- to output at least one encoded ASCII character, but the input contains
130 -- an invalid or unrepresentable sequence.