GHC/IO/Encoding.hs

   1 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
   2 {-# OPTIONS_GHC -funbox-strict-fields #-}
   3
   4 -----------------------------------------------------------------------------
   5 -- |
   6 -- Module      :  GHC.IO.Encoding
   7 -- Copyright   :  (c) The University of Glasgow, 2008-2009
   8 -- License     :  see libraries/base/LICENSE
   9 --
  10 -- Maintainer  :  libraries@haskell.org
  11 -- Stability   :  internal
  12 -- Portability :  non-portable
  13 --
  14 -- Text codecs for I/O
  15 --
  16 -----------------------------------------------------------------------------
  17
  18 module GHC.IO.Encoding (
  19   BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
  20   latin1, latin1_encode, latin1_decode,
  21   utf8, utf8_bom,
  22   utf16, utf16le, utf16be,
  23   utf32, utf32le, utf32be,
  24   localeEncoding, fileSystemEncoding, foreignEncoding,
  25   char8,
  26   mkTextEncoding,
  27   ) where
  28
  29 import GHC.Base
  30 --import GHC.IO
  31 import GHC.IO.Exception
  32 import GHC.IO.Buffer
  33 import GHC.IO.Encoding.Failure
  34 import GHC.IO.Encoding.Types
  35 import GHC.Word
  36 #if !defined(mingw32_HOST_OS)
  37 import qualified GHC.IO.Encoding.Iconv  as Iconv
  38 #else
  39 import qualified GHC.IO.Encoding.CodePage as CodePage
  40 import Text.Read (reads)
  41 #endif
  42 import qualified GHC.IO.Encoding.Latin1 as Latin1
  43 import qualified GHC.IO.Encoding.UTF8   as UTF8
  44 import qualified GHC.IO.Encoding.UTF16  as UTF16
  45 import qualified GHC.IO.Encoding.UTF32  as UTF32
  46
  47 import Data.List
  48 import Data.Maybe
  49
  50 -- -----------------------------------------------------------------------------
  51
  52 -- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
  53 -- directly to the first 256 Unicode code points, and is thus not a
  54 -- complete Unicode encoding.  An attempt to write a character greater than
  55 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
  56 latin1  :: TextEncoding
  57 latin1 = Latin1.latin1_checked
  58
  59 -- | The UTF-8 Unicode encoding
  60 utf8  :: TextEncoding
  61 utf8 = UTF8.utf8
  62
  63 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
  64 -- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
  65 -- except that on input, the BOM sequence is ignored at the beginning
  66 -- of the stream, and on output, the BOM sequence is prepended.
  67 --
  68 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
  69 -- sometimes used to identify the encoding of a file.
  70 --
  71 utf8_bom  :: TextEncoding
  72 utf8_bom = UTF8.utf8_bom
  73
  74 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  75 -- indicate endianness).
  76 utf16  :: TextEncoding
  77 utf16 = UTF16.utf16
  78
  79 -- | The UTF-16 Unicode encoding (litte-endian)
  80 utf16le  :: TextEncoding
  81 utf16le = UTF16.utf16le
  82
  83 -- | The UTF-16 Unicode encoding (big-endian)
  84 utf16be  :: TextEncoding
  85 utf16be = UTF16.utf16be
  86
  87 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
  88 -- indicate endianness).
  89 utf32  :: TextEncoding
  90 utf32 = UTF32.utf32
  91
  92 -- | The UTF-32 Unicode encoding (litte-endian)
  93 utf32le  :: TextEncoding
  94 utf32le = UTF32.utf32le
  95
  96 -- | The UTF-32 Unicode encoding (big-endian)
  97 utf32be  :: TextEncoding
  98 utf32be = UTF32.utf32be
  99
 100 -- | The Unicode encoding of the current locale
 101 localeEncoding :: TextEncoding
 102
 103 -- | The Unicode encoding of the current locale, but allowing arbitrary
 104 -- undecodable bytes to be round-tripped through it.
 105 --
 106 -- This 'TextEncoding' is used to decode and encode command line arguments
 107 -- and environment variables on non-Windows platforms.
 108 --
 109 -- On Windows, this encoding *should not* be used if possible because
 110 -- the use of code pages is deprecated: Strings should be retrieved
 111 -- via the "wide" W-family of UTF-16 APIs instead
 112 fileSystemEncoding :: TextEncoding
 113
 114 -- | The Unicode encoding of the current locale, but where undecodable
 115 -- bytes are replaced with their closest visual match. Used for
 116 -- the 'CString' marshalling functions in "Foreign.C.String"
 117 foreignEncoding :: TextEncoding
 118
 119 #if !defined(mingw32_HOST_OS)
 120 localeEncoding = Iconv.localeEncoding
 121 fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
 122 foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
 123 #else
 124 localeEncoding = CodePage.localeEncoding
 125 fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
 126 foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
 127 #endif
 128
 129 -- | An encoding in which Unicode code points are translated to bytes
 130 -- by taking the code point modulo 256.  When decoding, bytes are
 131 -- translated directly into the equivalent code point.
 132 --
 133 -- This encoding never fails in either direction.  However, encoding
 134 -- discards informaiton, so encode followed by decode is not the
 135 -- identity.
 136 char8 :: TextEncoding
 137 char8 = Latin1.latin1
 138
 139 -- | Look up the named Unicode encoding.  May fail with
 140 --
 141 --  * 'isDoesNotExistError' if the encoding is unknown
 142 --
 143 -- The set of known encodings is system-dependent, but includes at least:
 144 --
 145 --  * @UTF-8@
 146 --
 147 --  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
 148 --
 149 --  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
 150 --
 151 -- On systems using GNU iconv (e.g. Linux), there is additional
 152 -- notation for specifying how illegal characters are handled:
 153 --
 154 --  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
 155 --    all illegal sequences on input to be ignored, and on output
 156 --    will drop all code points that have no representation in the
 157 --    target encoding.
 158 --
 159 --  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
 160 --    for illegal sequences or code points.
 161 --
 162 -- On Windows, you can access supported code pages with the prefix
 163 -- @CP@; for example, @\"CP1250\"@.
 164 --
 165 mkTextEncoding :: String -> IO TextEncoding
 166 mkTextEncoding e = case mb_coding_failure_mode of
 167   Nothing -> unknown_encoding
 168   Just cfm -> case enc of
 169     "UTF-8"    -> return $ UTF8.mkUTF8 cfm
 170     "UTF-16"   -> return $ UTF16.mkUTF16 cfm
 171     "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
 172     "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
 173     "UTF-32"   -> return $ UTF32.mkUTF32 cfm
 174     "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
 175     "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
 176 #if defined(mingw32_HOST_OS)
 177     'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
 178     _ -> unknown_encoding
 179 #else
 180     _ -> Iconv.mkIconvEncoding cfm enc
 181 #endif
 182   where
 183     -- The only problem with actually documenting //IGNORE and //TRANSLIT as
 184     -- supported suffixes is that they are not necessarily supported with non-GNU iconv
 185     (enc, suffix) = span (/= '/') e
 186     mb_coding_failure_mode = case suffix of
 187         ""            -> Just ErrorOnCodingFailure
 188         "//IGNORE"    -> Just IgnoreCodingFailure
 189         "//TRANSLIT"  -> Just TransliterateCodingFailure
 190         "//ROUNDTRIP" -> Just RoundtripFailure
 191         _             -> Nothing
 192
 193     unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
 194                                             ("unknown encoding:" ++ e)  Nothing Nothing)
 195
 196 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
 197 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
 198 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
 199
 200 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
 201 latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
 202 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode