GHC/IO/Encoding.hs

   1 {-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields #-}
   2 -----------------------------------------------------------------------------
   3 -- |
   4 -- Module      :  GHC.IO.Encoding
   5 -- Copyright   :  (c) The University of Glasgow, 2008-2009
   6 -- License     :  see libraries/base/LICENSE
   7 --
   8 -- Maintainer  :  libraries@haskell.org
   9 -- Stability   :  internal
  10 -- Portability :  non-portable
  11 --
  12 -- Text codecs for I/O
  13 --
  14 -----------------------------------------------------------------------------
  15
  16 module GHC.IO.Encoding (
  17   BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
  18   latin1, latin1_encode, latin1_decode,
  19   utf8, utf8_bom,
  20   utf16, utf16le, utf16be,
  21   utf32, utf32le, utf32be,
  22   localeEncoding,
  23   mkTextEncoding,
  24   ) where
  25
  26 import GHC.Base
  27 --import GHC.IO
  28 import GHC.IO.Buffer
  29 import GHC.IO.Encoding.Types
  30 import GHC.Word
  31 #if !defined(mingw32_HOST_OS)
  32 import qualified GHC.IO.Encoding.Iconv  as Iconv
  33 #else
  34 import qualified GHC.IO.Encoding.CodePage as CodePage
  35 import Text.Read (reads)
  36 #endif
  37 import qualified GHC.IO.Encoding.Latin1 as Latin1
  38 import qualified GHC.IO.Encoding.UTF8   as UTF8
  39 import qualified GHC.IO.Encoding.UTF16  as UTF16
  40 import qualified GHC.IO.Encoding.UTF32  as UTF32
  41
  42 #if defined(mingw32_HOST_OS)
  43 import Data.Maybe
  44 import GHC.IO.Exception
  45 #endif
  46
  47 -- -----------------------------------------------------------------------------
  48
  49 -- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
  50 -- directly to the first 256 Unicode code points, and is thus not a
  51 -- complete Unicode encoding.  An attempt to write a character greater than
  52 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
  53 latin1  :: TextEncoding
  54 latin1 = Latin1.latin1_checked -- checked codec; 'latin1_encode' is the unchecked one
  55
  56 -- | The UTF-8 Unicode encoding, with no byte-order-mark handling (cf. 'utf8_bom').
  57 utf8  :: TextEncoding
  58 utf8 = UTF8.utf8
  59
  60 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
  61 -- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
  62 -- except that on input a BOM sequence at the beginning of the stream
  63 -- is skipped, and on output a BOM sequence is prepended.
  64 --
  65 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
  66 -- sometimes used to identify the encoding of a file.
  67 --
  68 utf8_bom  :: TextEncoding
  69 utf8_bom = UTF8.utf8_bom
  70
  71 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  72 -- indicate endianness).  See 'utf16le' and 'utf16be' for a fixed byte order.
  73 utf16  :: TextEncoding
  74 utf16 = UTF16.utf16
  75
  76 -- | The UTF-16 Unicode encoding (little-endian)
  77 utf16le  :: TextEncoding
  78 utf16le = UTF16.utf16le
  79
  80 -- | The UTF-16 Unicode encoding (big-endian); cf. 'utf16' for BOM-indicated order
  81 utf16be  :: TextEncoding
  82 utf16be = UTF16.utf16be
  83
  84 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
  85 -- indicate endianness).  See 'utf32le' and 'utf32be' for a fixed byte order.
  86 utf32  :: TextEncoding
  87 utf32 = UTF32.utf32
  88
  89 -- | The UTF-32 Unicode encoding (little-endian)
  90 utf32le  :: TextEncoding
  91 utf32le = UTF32.utf32le
  92
  93 -- | The UTF-32 Unicode encoding (big-endian); cf. 'utf32' for BOM-indicated order
  94 utf32be  :: TextEncoding
  95 utf32be = UTF32.utf32be
  96
  97 -- | The Unicode encoding of the current locale
  98 localeEncoding  :: TextEncoding
  99 #if !defined(mingw32_HOST_OS)
 100 localeEncoding = Iconv.localeEncoding -- non-Windows hosts: taken from the Iconv module
 101 #else
 102 localeEncoding = CodePage.localeEncoding -- Windows (mingw32) hosts: taken from the CodePage module
 103 #endif
 104
 105 -- | Look up the named Unicode encoding.  May fail with
 106 --
 107 --  * 'isDoesNotExistError' if the encoding is unknown
 108 --
 109 -- The set of known encodings is system-dependent, but includes at least:
 110 --
 111 --  * @UTF-8@
 112 --
 113 --  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
 114 --
 115 --  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
 116 --
 117 -- On systems using GNU iconv (e.g. Linux), there is additional
 118 -- notation for specifying how illegal characters are handled:
 119 --
 120 --  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
 121 --    all illegal sequences on input to be ignored, and on output
 122 --    will drop all code points that have no representation in the
 123 --    target encoding.
 124 --
 125 --  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
 126 --    for illegal sequences or code points.
 127 --
 128 -- On Windows, you can access supported code pages with the prefix
 129 -- @CP@; for example, @\"CP1250\"@.
 130 --
 131 mkTextEncoding :: String -> IO TextEncoding
 132 #if !defined(mingw32_HOST_OS)
 133 mkTextEncoding = Iconv.mkTextEncoding -- non-Windows: delegate the lookup to the Iconv module
 134 #else
 135 mkTextEncoding name = case name of -- Windows: fixed names first, then CP<number>
 136   "UTF-8"    -> return utf8
 137   "UTF-16"   -> return utf16
 138   "UTF-16LE" -> return utf16le
 139   "UTF-16BE" -> return utf16be
 140   "UTF-32"   -> return utf32
 141   "UTF-32LE" -> return utf32le
 142   "UTF-32BE" -> return utf32be
 143   'C':'P':suffix -- the suffix must parse completely (no leftover input), else fall through
 144     | [(cp,"")] <- reads suffix -> return (CodePage.codePageEncoding cp)
 145   _ -> ioException (IOError Nothing NoSuchThing "mkTextEncoding"
 146                       ("unknown encoding:" ++ name) Nothing Nothing)
 147 #endif
 148
 149 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
 150 latin1_encode = Latin1.latin1_encode -- unchecked (unlike 'latin1'), used for binary
 151 -- disabled alternative: latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
 152
 153 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
 154 latin1_decode = Latin1.latin1_decode -- decoding counterpart of 'latin1_encode'
 155 -- disabled alternative: latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode