1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
2 -----------------------------------------------------------------------------
4 -- Module : GHC.IO.Encoding
5 -- Copyright : (c) The University of Glasgow, 2008-2009
6 -- License : see libraries/base/LICENSE
8 -- Maintainer : libraries@haskell.org
9 -- Stability : internal
10 -- Portability : non-portable
12 -- Text codecs for I/O
14 -----------------------------------------------------------------------------
16 module GHC.IO.Encoding (
17 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
18 latin1, latin1_encode, latin1_decode,
20 utf16, utf16le, utf16be,
21 utf32, utf32le, utf32be,
29 import GHC.IO.Encoding.Types
31 #if !defined(mingw32_HOST_OS)
32 import qualified GHC.IO.Encoding.Iconv as Iconv
34 import qualified GHC.IO.Encoding.CodePage as CodePage
35 import Text.Read (reads)
37 import qualified GHC.IO.Encoding.Latin1 as Latin1
38 import qualified GHC.IO.Encoding.UTF8 as UTF8
39 import qualified GHC.IO.Encoding.UTF16 as UTF16
40 import qualified GHC.IO.Encoding.UTF32 as UTF32
42 #if defined(mingw32_HOST_OS)
44 import GHC.IO.Exception
47 -- -----------------------------------------------------------------------------
-- | The Latin1 (ISO8859-1) encoding.
--
-- Bytes are mapped directly onto the first 256 Unicode code points, so
-- this is not a complete Unicode encoding: attempting to write a
-- character greater than '\255' to a 'Handle' that uses 'latin1' will
-- result in an error.
--
-- This is the checked variant (@Latin1.latin1_checked@).
latin1 :: TextEncoding
latin1 = Latin1.latin1_checked
56 -- | The UTF-8 Unicode encoding
-- | The UTF-8 Unicode encoding with a byte-order-mark (BOM), i.e. the
-- byte sequence 0xEF 0xBB 0xBF.
--
-- This encoding behaves like 'utf8', except that:
--
--  * on input, a BOM sequence at the beginning of the stream is ignored;
--
--  * on output, the BOM sequence is prepended.
--
-- The byte-order-mark is strictly unnecessary in UTF-8, but it is
-- sometimes used to identify the encoding of a file.
utf8_bom :: TextEncoding
utf8_bom = UTF8.utf8_bom
71 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
72 -- indicate endianness).
-- | The UTF-16 Unicode encoding, little-endian variant.
utf16le :: TextEncoding
utf16le = UTF16.utf16le
-- | The UTF-16 Unicode encoding, big-endian variant.
utf16be :: TextEncoding
utf16be = UTF16.utf16be
84 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
85 -- indicate endianness).
-- | The UTF-32 Unicode encoding, little-endian variant.
utf32le :: TextEncoding
utf32le = UTF32.utf32le
-- | The UTF-32 Unicode encoding, big-endian variant.
utf32be :: TextEncoding
utf32be = UTF32.utf32be
-- | The Unicode encoding of the current locale.
--
-- The implementation is chosen at compile time: on every platform
-- except Windows (@mingw32_HOST_OS@) it is @Iconv.localeEncoding@; on
-- Windows it is @CodePage.localeEncoding@.
localeEncoding :: TextEncoding
#if !defined(mingw32_HOST_OS)
localeEncoding = Iconv.localeEncoding
#else
-- Windows: "Iconv" is not imported here, so use the code-page backend.
localeEncoding = CodePage.localeEncoding
#endif
-- | Look up the named Unicode encoding.  May fail with
--
--  * 'isDoesNotExistError' if the encoding is unknown
--
-- The set of known encodings is system-dependent, but includes at least:
--
--  * @UTF-8@
--
--  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
--
--  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
--
-- On systems using GNU iconv (e.g. Linux), there is additional
-- notation for specifying how illegal characters are handled:
--
--  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
--    all illegal sequences on input to be ignored, and on output
--    will drop all code points that have no representation in the
--    target encoding.
--
--  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
--    for illegal sequences or code points.
--
-- On Windows, you can access supported code pages with the prefix
-- @CP@; for example, @\"CP1250\"@.  Note that the Windows
-- implementation below compares names literally, so they must be
-- spelled exactly as listed above.
--
mkTextEncoding :: String -> IO TextEncoding
#if !defined(mingw32_HOST_OS)
-- Non-Windows: name resolution is delegated entirely to the iconv backend.
mkTextEncoding = Iconv.mkTextEncoding
#else
-- Windows: a fixed table of built-in encodings, matched by exact name ...
mkTextEncoding "UTF-8"    = return utf8
mkTextEncoding "UTF-16"   = return utf16
mkTextEncoding "UTF-16LE" = return utf16le
mkTextEncoding "UTF-16BE" = return utf16be
mkTextEncoding "UTF-32"   = return utf32
mkTextEncoding "UTF-32LE" = return utf32le
mkTextEncoding "UTF-32BE" = return utf32be
-- ... then "CP<n>" code pages.  The guard only succeeds when 'reads'
-- yields exactly one parse that consumes the whole remainder; otherwise
-- matching falls through to the error equation below.
mkTextEncoding ('C':'P':n)
    | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
-- Anything else is reported as a 'NoSuchThing' IOError.
mkTextEncoding e = ioException
     (IOError Nothing NoSuchThing "mkTextEncoding"
          ("unknown encoding:" ++ e)  Nothing Nothing)
#endif
-- | Latin1 encoder; an alias for @Latin1.latin1_encode@.
--
-- This is the unchecked encoder, used for binary.  Compare 'latin1',
-- which is defined as the checked encoding.
latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
latin1_encode = Latin1.latin1_encode
-- Disabled alternative definition, kept from the original source:
--latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
-- | Latin1 decoder; an alias for @Latin1.latin1_decode@.
latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
latin1_decode = Latin1.latin1_decode
-- Disabled alternative definition, kept from the original source:
--latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode