1 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
2 {-# OPTIONS_GHC -funbox-strict-fields #-}
4 -----------------------------------------------------------------------------
6 -- Module : GHC.IO.Encoding
7 -- Copyright : (c) The University of Glasgow, 2008-2009
8 -- License : see libraries/base/LICENSE
10 -- Maintainer : libraries@haskell.org
11 -- Stability : internal
12 -- Portability : non-portable
14 -- Text codecs for I/O
16 -----------------------------------------------------------------------------
18 module GHC.IO.Encoding (
19 BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
20 latin1, latin1_encode, latin1_decode,
22 utf16, utf16le, utf16be,
23 utf32, utf32le, utf32be,
24 localeEncoding, fileSystemEncoding, foreignEncoding,
31 import GHC.IO.Exception
33 import GHC.IO.Encoding.Failure
34 import GHC.IO.Encoding.Types
36 #if !defined(mingw32_HOST_OS)
37 import qualified GHC.IO.Encoding.Iconv as Iconv
39 import qualified GHC.IO.Encoding.CodePage as CodePage
40 import Text.Read (reads)
42 import qualified GHC.IO.Encoding.Latin1 as Latin1
43 import qualified GHC.IO.Encoding.UTF8 as UTF8
44 import qualified GHC.IO.Encoding.UTF16 as UTF16
45 import qualified GHC.IO.Encoding.UTF32 as UTF32
50 -- -----------------------------------------------------------------------------
-- | The Latin1 (ISO8859-1) encoding, which maps each byte directly onto
-- one of the first 256 Unicode code points.  Since it covers only that
-- range, it is not a complete Unicode encoding: attempting to write a
-- character greater than '\255' to a 'Handle' that uses 'latin1' will
-- result in an error.
latin1 :: TextEncoding
latin1 = Latin1.latin1_checked
59 -- | The UTF-8 Unicode encoding
-- | UTF-8 with a byte-order-mark (BOM), i.e. the byte sequence
-- 0xEF 0xBB 0xBF.  It behaves like 'utf8', except that a BOM at the
-- beginning of the stream is ignored on input, and a BOM is prepended
-- on output.
--
-- UTF-8 does not strictly need a byte-order-mark; it is sometimes used
-- purely as a way of identifying the encoding of a file.
utf8_bom :: TextEncoding
utf8_bom = UTF8.utf8_bom
74 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
75 -- indicate endianness).
-- | UTF-16 with little-endian byte order.
utf16le :: TextEncoding
utf16le = UTF16.utf16le
-- | UTF-16 with big-endian byte order.
utf16be :: TextEncoding
utf16be = UTF16.utf16be
87 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
88 -- indicate endianness).
-- | UTF-32 with little-endian byte order.
utf32le :: TextEncoding
utf32le = UTF32.utf32le
-- | UTF-32 with big-endian byte order.
utf32be :: TextEncoding
utf32be = UTF32.utf32be
-- | The Unicode encoding of the current locale.
localeEncoding :: TextEncoding

-- | The Unicode encoding of the current locale, constructed with the
-- 'RoundtripFailure' mode so that arbitrary undecodable bytes can be
-- round-tripped through it.
--
-- This 'TextEncoding' is used to decode and encode command line
-- arguments and environment variables on non-Windows platforms.
--
-- On Windows, this encoding /should not/ be used if possible, because
-- the use of code pages is deprecated: strings should be retrieved via
-- the \"wide\" W-family of UTF-16 APIs instead.
fileSystemEncoding :: TextEncoding

-- | The Unicode encoding of the current locale, constructed with the
-- 'IgnoreCodingFailure' mode.  Used for the 'CString' marshalling
-- functions in "Foreign.C.String".
foreignEncoding :: TextEncoding

-- Exactly one set of equations must be compiled: non-Windows platforms
-- take the locale encodings from the iconv backend, Windows takes them
-- from the code page backend.
--
-- NOTE(review): 'foreignEncoding' used to be documented as replacing
-- undecodable bytes with their closest visual match, but the mode passed
-- below is 'IgnoreCodingFailure' -- confirm which behaviour is intended.
#if !defined(mingw32_HOST_OS)
localeEncoding     = Iconv.localeEncoding
fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
foreignEncoding    = Iconv.mkLocaleEncoding IgnoreCodingFailure
#else
localeEncoding     = CodePage.localeEncoding
fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
foreignEncoding    = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif
-- | An encoding that translates Unicode code points to bytes by taking
-- the code point modulo 256.  When decoding, each byte is translated
-- directly into the equivalent code point.
--
-- Neither direction ever fails.  Encoding discards information,
-- however, so encoding followed by decoding is not the identity.
char8 :: TextEncoding
char8 = Latin1.latin1
-- | Look up the named Unicode encoding.  May fail with
--
--  * 'isDoesNotExistError' if the encoding is unknown
--
-- The set of known encodings is system-dependent, but includes at least:
--
--  * @UTF-8@
--
--  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
--
--  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
--
-- On systems using GNU iconv (e.g. Linux), there is additional
-- notation for specifying how illegal characters are handled:
--
--  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
--    all illegal sequences on input to be ignored, and on output
--    will drop all code points that have no representation in the
--    target encoding.
--
--  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
--    for illegal sequences or code points.
--
-- On Windows, you can access supported code pages with the prefix
-- @CP@; for example, @\"CP1250\"@.
--
-- The name is split at its first @\/@: the part before it selects the
-- encoding and the remainder must be empty or one of the suffixes
-- @\/\/IGNORE@, @\/\/TRANSLIT@ or @\/\/ROUNDTRIP@.  Any other suffix is
-- reported as an unknown encoding.
mkTextEncoding :: String -> IO TextEncoding
mkTextEncoding e = case mb_coding_failure_mode of
    Nothing  -> unknown_encoding
    Just cfm -> case enc of
      -- Encodings implemented natively, matched by exact name.
      "UTF-8"    -> return $ UTF8.mkUTF8 cfm
      "UTF-16"   -> return $ UTF16.mkUTF16 cfm
      "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
      "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
      "UTF-32"   -> return $ UTF32.mkUTF32 cfm
      "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
      "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
      -- Everything else is platform-specific; exactly one fallback
      -- branch is compiled in.
#if defined(mingw32_HOST_OS)
      'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
      _ -> unknown_encoding
#else
      _ -> Iconv.mkIconvEncoding cfm enc
#endif
  where
    -- The only problem with actually documenting //IGNORE and //TRANSLIT as
    -- supported suffixes is that they are not necessarily supported with non-GNU iconv
    (enc, suffix) = span (/= '/') e

    -- Map the suffix to a failure mode.  The catch-all arm is what makes
    -- an unrecognised suffix surface as 'unknown_encoding' instead of a
    -- pattern-match failure.
    mb_coding_failure_mode = case suffix of
      ""            -> Just ErrorOnCodingFailure
      "//IGNORE"    -> Just IgnoreCodingFailure
      "//TRANSLIT"  -> Just TransliterateCodingFailure
      "//ROUNDTRIP" -> Just RoundtripFailure
      _             -> Nothing

    -- Raised for an unknown encoding name or an unknown suffix; the
    -- message carries the full string the caller passed in.
    unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
                                            ("unknown encoding:" ++ e) Nothing Nothing)
-- | Run the unchecked Latin1 encoder (the one used for 'char8') from
-- @input@ into @output@ and return the input and output buffers it
-- produces.  The first component of the encoder's result triple is
-- discarded.
latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
latin1_encode input output =
    fmap buffersOnly (Latin1.latin1_encode input output)
  where
    -- Keep just the two buffers from the encoder's three-part result.
    buffersOnly (_status, inBuf, outBuf) = (inBuf, outBuf)
198 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
-- | Run the Latin1 decoder from @input@ into @output@ and return the
-- input and output buffers it produces.  The first component of the
-- decoder's result triple is discarded.
latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
latin1_decode input output =
    fmap buffersOnly (Latin1.latin1_decode input output)
  where
    -- Keep just the two buffers from the decoder's three-part result.
    buffersOnly (_status, inBuf, outBuf) = (inBuf, outBuf)
202 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode