GHC/IO/Encoding.hs

   1 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
   2 {-# OPTIONS_GHC -funbox-strict-fields #-}
   3
   4 -----------------------------------------------------------------------------
   5 -- |
   6 -- Module      :  GHC.IO.Encoding
   7 -- Copyright   :  (c) The University of Glasgow, 2008-2009
   8 -- License     :  see libraries/base/LICENSE
   9 --
  10 -- Maintainer  :  libraries@haskell.org
  11 -- Stability   :  internal
  12 -- Portability :  non-portable
  13 --
  14 -- Text codecs for I/O
  15 --
  16 -----------------------------------------------------------------------------
  17
  18 module GHC.IO.Encoding (
  19   BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
  20   latin1, latin1_encode, latin1_decode,
  21   utf8, utf8_bom,
  22   utf16, utf16le, utf16be,
  23   utf32, utf32le, utf32be,
  24   localeEncoding, fileSystemEncoding, foreignEncoding,
  25   mkTextEncoding,
  26   ) where
  27
  28 import GHC.Base
  29 --import GHC.IO
  30 import GHC.IO.Exception
  31 import GHC.IO.Buffer
  32 import GHC.IO.Encoding.Failure
  33 import GHC.IO.Encoding.Types
  34 import GHC.Word
  35 #if !defined(mingw32_HOST_OS)
  36 import qualified GHC.IO.Encoding.Iconv  as Iconv
  37 #else
  38 import qualified GHC.IO.Encoding.CodePage as CodePage
  39 import Text.Read (reads)
  40 #endif
  41 import qualified GHC.IO.Encoding.Latin1 as Latin1
  42 import qualified GHC.IO.Encoding.UTF8   as UTF8
  43 import qualified GHC.IO.Encoding.UTF16  as UTF16
  44 import qualified GHC.IO.Encoding.UTF32  as UTF32
  45
  46 import Data.List
  47 import Data.Maybe
  48
  49 -- -----------------------------------------------------------------------------
  50
  51 -- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
  52 -- directly to the first 256 Unicode code points, and is thus not a
  53 -- complete Unicode encoding.  An attempt to write a character greater than
  54 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
  55 latin1  :: TextEncoding
  56 latin1 = Latin1.latin1_checked
  57
  58 -- | The UTF-8 Unicode encoding
  59 utf8  :: TextEncoding
  60 utf8 = UTF8.utf8
  61
  62 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
  63 -- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
  64 -- except that on input, the BOM sequence is ignored at the beginning
  65 -- of the stream, and on output, the BOM sequence is prepended.
  66 --
  67 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
  68 -- sometimes used to identify the encoding of a file.
  69 --
  70 utf8_bom  :: TextEncoding
  71 utf8_bom = UTF8.utf8_bom
  72
  73 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  74 -- indicate endianness).
  75 utf16  :: TextEncoding
  76 utf16 = UTF16.utf16
  77
  78 -- | The UTF-16 Unicode encoding (litte-endian)
  79 utf16le  :: TextEncoding
  80 utf16le = UTF16.utf16le
  81
  82 -- | The UTF-16 Unicode encoding (big-endian)
  83 utf16be  :: TextEncoding
  84 utf16be = UTF16.utf16be
  85
  86 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
  87 -- indicate endianness).
  88 utf32  :: TextEncoding
  89 utf32 = UTF32.utf32
  90
  91 -- | The UTF-32 Unicode encoding (litte-endian)
  92 utf32le  :: TextEncoding
  93 utf32le = UTF32.utf32le
  94
  95 -- | The UTF-32 Unicode encoding (big-endian)
  96 utf32be  :: TextEncoding
  97 utf32be = UTF32.utf32be
  98
  99 -- | The Unicode encoding of the current locale
 100 localeEncoding :: TextEncoding
 101
 102 -- | The Unicode encoding of the current locale, but allowing arbitrary
 103 -- undecodable bytes to be round-tripped through it.
 104 --
 105 -- This 'TextEncoding' is used to decode and encode command line arguments
 106 -- and environment variables on non-Windows platforms.
 107 --
 108 -- On Windows, this encoding *should not* be used if possible because
 109 -- the use of code pages is deprecated: Strings should be retrieved
 110 -- via the "wide" W-family of UTF-16 APIs instead
 111 fileSystemEncoding :: TextEncoding
 112
 113 -- | The Unicode encoding of the current locale, but where undecodable
 114 -- bytes are replaced with their closest visual match. Used for
 115 -- the 'CString' marshalling functions in "Foreign.C.String"
 116 foreignEncoding :: TextEncoding
 117
 118 #if !defined(mingw32_HOST_OS)
 119 localeEncoding = Iconv.localeEncoding
 120 fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
 121 foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
 122 #else
 123 localeEncoding = CodePage.localeEncoding
 124 fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
 125 foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
 126 #endif
 127
 128 -- | Look up the named Unicode encoding.  May fail with
 129 --
 130 --  * 'isDoesNotExistError' if the encoding is unknown
 131 --
 132 -- The set of known encodings is system-dependent, but includes at least:
 133 --
 134 --  * @UTF-8@
 135 --
 136 --  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
 137 --
 138 --  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
 139 --
 140 -- On systems using GNU iconv (e.g. Linux), there is additional
 141 -- notation for specifying how illegal characters are handled:
 142 --
 143 --  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
 144 --    all illegal sequences on input to be ignored, and on output
 145 --    will drop all code points that have no representation in the
 146 --    target encoding.
 147 --
 148 --  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
 149 --    for illegal sequences or code points.
 150 --
 151 -- On Windows, you can access supported code pages with the prefix
 152 -- @CP@; for example, @\"CP1250\"@.
 153 --
 154 mkTextEncoding :: String -> IO TextEncoding
 155 mkTextEncoding e = case mb_coding_failure_mode of
 156   Nothing -> unknown_encoding
 157   Just cfm -> case enc of
 158     "UTF-8"    -> return $ UTF8.mkUTF8 cfm
 159     "UTF-16"   -> return $ UTF16.mkUTF16 cfm
 160     "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
 161     "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
 162     "UTF-32"   -> return $ UTF32.mkUTF32 cfm
 163     "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
 164     "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
 165 #if defined(mingw32_HOST_OS)
 166     'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
 167     _ -> unknown_encoding
 168 #else
 169     _ -> Iconv.mkIconvEncoding cfm enc
 170 #endif
 171   where
 172     -- The only problem with actually documenting //IGNORE and //TRANSLIT as
 173     -- supported suffixes is that they are not necessarily supported with non-GNU iconv
 174     (enc, suffix) = span (/= '/') e
 175     mb_coding_failure_mode = case suffix of
 176         ""            -> Just ErrorOnCodingFailure
 177         "//IGNORE"    -> Just IgnoreCodingFailure
 178         "//TRANSLIT"  -> Just TransliterateCodingFailure
 179         "//ROUNDTRIP" -> Just RoundtripFailure
 180         _             -> Nothing
 181
 182     unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
 183                                             ("unknown encoding:" ++ e)  Nothing Nothing)
 184
 185 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
 186 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for binary
 187 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
 188
 189 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
 190 latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
 191 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode