1 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
2 {-# OPTIONS_GHC -funbox-strict-fields #-}
4 -----------------------------------------------------------------------------
6 -- Module : GHC.IO.Encoding
7 -- Copyright : (c) The University of Glasgow, 2008-2009
8 -- License : see libraries/base/LICENSE
10 -- Maintainer : libraries@haskell.org
11 -- Stability : internal
12 -- Portability : non-portable
14 -- Text codecs for I/O
16 -----------------------------------------------------------------------------
18 module GHC.IO.Encoding (
19 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
20 latin1, latin1_encode, latin1_decode,
22 utf16, utf16le, utf16be,
23 utf32, utf32le, utf32be,
31 import GHC.IO.Encoding.Types
33 #if !defined(mingw32_HOST_OS)
34 import qualified GHC.IO.Encoding.Iconv as Iconv
36 import qualified GHC.IO.Encoding.CodePage as CodePage
37 import Text.Read (reads)
39 import qualified GHC.IO.Encoding.Latin1 as Latin1
40 import qualified GHC.IO.Encoding.UTF8 as UTF8
41 import qualified GHC.IO.Encoding.UTF16 as UTF16
42 import qualified GHC.IO.Encoding.UTF32 as UTF32
44 #if defined(mingw32_HOST_OS)
46 import GHC.IO.Exception
49 -- -----------------------------------------------------------------------------
51 -- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes
52 -- directly to the first 256 Unicode code points, and is thus not a
53 -- complete Unicode encoding. An attempt to write a character greater than
54 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
55 latin1 :: TextEncoding
-- Bound to the @latin1_checked@ codec of "GHC.IO.Encoding.Latin1"; the raw
-- 'latin1_encode' step exported further down is the unchecked counterpart.
56 latin1 = Latin1.latin1_checked
58 -- | The UTF-8 Unicode encoding
62 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
63 -- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8',
64 -- except that on input, the BOM sequence is ignored at the beginning
65 -- of the stream, and on output, the BOM sequence is prepended.
67 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
68 -- sometimes used to identify the encoding of a file.
70 utf8_bom :: TextEncoding
71 utf8_bom = UTF8.utf8_bom -- re-exported unchanged from "GHC.IO.Encoding.UTF8"
73 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
74 -- indicate endianness).
78 -- | The UTF-16 Unicode encoding (little-endian)
79 utf16le :: TextEncoding
80 utf16le = UTF16.utf16le -- re-exported unchanged from "GHC.IO.Encoding.UTF16"
82 -- | The UTF-16 Unicode encoding (big-endian)
83 utf16be :: TextEncoding
84 utf16be = UTF16.utf16be -- re-exported unchanged from "GHC.IO.Encoding.UTF16"
86 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
87 -- indicate endianness).
91 -- | The UTF-32 Unicode encoding (little-endian)
92 utf32le :: TextEncoding
93 utf32le = UTF32.utf32le -- re-exported unchanged from "GHC.IO.Encoding.UTF32"
95 -- | The UTF-32 Unicode encoding (big-endian)
96 utf32be :: TextEncoding
97 utf32be = UTF32.utf32be -- re-exported unchanged from "GHC.IO.Encoding.UTF32"
99 -- | The Unicode encoding of the current locale
100 localeEncoding :: TextEncoding
101 #if !defined(mingw32_HOST_OS)
-- When mingw32_HOST_OS is not defined: use the locale encoding provided by
-- the iconv binding ("GHC.IO.Encoding.Iconv").
102 localeEncoding = Iconv.localeEncoding
-- NOTE(review): the embedded numbering jumps from 102 to 104 and then past
-- 105, so the `#else` / `#endif` lines are not visible in this listing;
-- presumably the definition below is the mingw32 alternative -- confirm
-- against the complete file.
104 localeEncoding = CodePage.localeEncoding
107 -- | Look up the named Unicode encoding. May fail with
109 -- * 'isDoesNotExistError' if the encoding is unknown
111 -- The set of known encodings is system-dependent, but includes at least:
115 -- * @UTF-16@, @UTF-16BE@, @UTF-16LE@
117 -- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
119 -- On systems using GNU iconv (e.g. Linux), there is additional
120 -- notation for specifying how illegal characters are handled:
122 -- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
123 -- all illegal sequences on input to be ignored, and on output
124 -- will drop all code points that have no representation in the
127 -- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
128 -- for illegal sequences or code points.
130 -- On Windows, you can access supported code pages with the prefix
131 -- @CP@; for example, @\"CP1250\"@.
133 mkTextEncoding :: String -> IO TextEncoding
134 #if !defined(mingw32_HOST_OS)
-- When mingw32_HOST_OS is not defined, name lookup is delegated entirely to
-- the iconv binding; none of the equations further down are involved.
135 mkTextEncoding = Iconv.mkTextEncoding
-- NOTE(review): the embedded numbering jumps from 135 to 137, so the `#else`
-- that must separate the two implementations is not visible in this listing
-- (likewise the closing `#endif` after line 148) -- confirm against the
-- complete file.
--
-- Built-in encodings are matched by exact string literal, so the comparison
-- is case-sensitive and no normalisation of the name is performed.
137 mkTextEncoding "UTF-8" = return utf8
138 mkTextEncoding "UTF-16" = return utf16
139 mkTextEncoding "UTF-16LE" = return utf16le
140 mkTextEncoding "UTF-16BE" = return utf16be
141 mkTextEncoding "UTF-32" = return utf32
142 mkTextEncoding "UTF-32LE" = return utf32le
143 mkTextEncoding "UTF-32BE" = return utf32be
-- Names of the form "CP<rest>": the pattern guard succeeds only when 'reads'
-- yields exactly one parse that consumes all of <rest>. If the guard fails,
-- matching falls through to the catch-all equation below.
144 mkTextEncoding ('C':'P':n)
145 | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
-- Anything else: raise an 'IOError' of type 'NoSuchThing' whose location
-- field is "mkTextEncoding" and whose description embeds the requested name.
-- NOTE(review): there is no space after the colon, so the description reads
-- "unknown encoding:NAME"; left as is because it is runtime text.
146 mkTextEncoding e = ioException
147 (IOError Nothing NoSuchThing "mkTextEncoding"
148 ("unknown encoding:" ++ e) Nothing Nothing)
-- | Latin-1 encoding step, taken directly from "GHC.IO.Encoding.Latin1".
-- Per the signature it is handed a 'CharBuffer' and a @'Buffer' 'Word8'@ and
-- yields, in 'IO', a pair of those same two buffer types.
151 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
152 latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
-- Alternative iconv-based definition, kept commented out:
153 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
-- | Latin-1 decoding step, taken directly from "GHC.IO.Encoding.Latin1".
-- Per the signature it is handed a @'Buffer' 'Word8'@ and a 'CharBuffer' and
-- yields, in 'IO', a pair of those same two buffer types.
155 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
156 latin1_decode = Latin1.latin1_decode
-- Alternative iconv-based definition, kept commented out.
-- NOTE(review): it ends in `return.encode` although it builds a decoder;
-- presumably a copy/paste slip -- verify before ever re-enabling it.
157 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode