GHC/IO/Encoding.hs

   1 {-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
   2 {-# OPTIONS_GHC -funbox-strict-fields #-}
   3
   4 -----------------------------------------------------------------------------
   5 -- |
   6 -- Module      :  GHC.IO.Encoding
   7 -- Copyright   :  (c) The University of Glasgow, 2008-2009
   8 -- License     :  see libraries/base/LICENSE
   9 --
  10 -- Maintainer  :  libraries@haskell.org
  11 -- Stability   :  internal
  12 -- Portability :  non-portable
  13 --
  14 -- Text codecs for I/O
  15 --
  16 -----------------------------------------------------------------------------
  17
  18 module GHC.IO.Encoding (
  19   BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
  20   latin1, latin1_encode, latin1_decode,
  21   utf8, utf8_bom,
  22   utf16, utf16le, utf16be,
  23   utf32, utf32le, utf32be,
  24   localeEncoding,
  25   mkTextEncoding,
  26   ) where
  27
  28 import GHC.Base
  29 --import GHC.IO
  30 import GHC.IO.Buffer
  31 import GHC.IO.Encoding.Types
  32 import GHC.Word
  33 #if !defined(mingw32_HOST_OS)
  34 import qualified GHC.IO.Encoding.Iconv  as Iconv
  35 #else
  36 import qualified GHC.IO.Encoding.CodePage as CodePage
  37 import Text.Read (reads)
  38 #endif
  39 import qualified GHC.IO.Encoding.Latin1 as Latin1
  40 import qualified GHC.IO.Encoding.UTF8   as UTF8
  41 import qualified GHC.IO.Encoding.UTF16  as UTF16
  42 import qualified GHC.IO.Encoding.UTF32  as UTF32
  43
  44 #if defined(mingw32_HOST_OS)
  45 import Data.Maybe
  46 import GHC.IO.Exception
  47 #endif
  48
  49 -- -----------------------------------------------------------------------------
  50
  51 -- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
  52 -- directly to the first 256 Unicode code points, and is thus not a
  53 -- complete Unicode encoding.  An attempt to write a character greater than
  54 -- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
  55 latin1  :: TextEncoding
  56 latin1 = Latin1.latin1_checked
  57
  58 -- | The UTF-8 Unicode encoding
  59 utf8  :: TextEncoding
  60 utf8 = UTF8.utf8
  61
  62 -- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
  63 -- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
  64 -- except that on input, the BOM sequence is ignored at the beginning
  65 -- of the stream, and on output, the BOM sequence is prepended.
  66 --
  67 -- The byte-order-mark is strictly unnecessary in UTF-8, but is
  68 -- sometimes used to identify the encoding of a file.
  69 --
  70 utf8_bom  :: TextEncoding
  71 utf8_bom = UTF8.utf8_bom
  72
  73 -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  74 -- indicate endianness).
  75 utf16  :: TextEncoding
  76 utf16 = UTF16.utf16
  77
  78 -- | The UTF-16 Unicode encoding (litte-endian)
  79 utf16le  :: TextEncoding
  80 utf16le = UTF16.utf16le
  81
  82 -- | The UTF-16 Unicode encoding (big-endian)
  83 utf16be  :: TextEncoding
  84 utf16be = UTF16.utf16be
  85
  86 -- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
  87 -- indicate endianness).
  88 utf32  :: TextEncoding
  89 utf32 = UTF32.utf32
  90
  91 -- | The UTF-32 Unicode encoding (litte-endian)
  92 utf32le  :: TextEncoding
  93 utf32le = UTF32.utf32le
  94
  95 -- | The UTF-32 Unicode encoding (big-endian)
  96 utf32be  :: TextEncoding
  97 utf32be = UTF32.utf32be
  98
  99 -- | The Unicode encoding of the current locale
 100 localeEncoding  :: TextEncoding
 101 #if !defined(mingw32_HOST_OS)
 102 localeEncoding = Iconv.localeEncoding
 103 #else
 104 localeEncoding = CodePage.localeEncoding
 105 #endif
 106
 107 -- | Look up the named Unicode encoding.  May fail with
 108 --
 109 --  * 'isDoesNotExistError' if the encoding is unknown
 110 --
 111 -- The set of known encodings is system-dependent, but includes at least:
 112 --
 113 --  * @UTF-8@
 114 --
 115 --  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
 116 --
 117 --  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
 118 --
 119 -- On systems using GNU iconv (e.g. Linux), there is additional
 120 -- notation for specifying how illegal characters are handled:
 121 --
 122 --  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
 123 --    all illegal sequences on input to be ignored, and on output
 124 --    will drop all code points that have no representation in the
 125 --    target encoding.
 126 --
 127 --  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
 128 --    for illegal sequences or code points.
 129 --
 130 -- On Windows, you can access supported code pages with the prefix
 131 -- @CP@; for example, @\"CP1250\"@.
 132 --
 133 mkTextEncoding :: String -> IO TextEncoding
 134 #if !defined(mingw32_HOST_OS)
 135 mkTextEncoding = Iconv.mkTextEncoding
 136 #else
 137 mkTextEncoding "UTF-8"    = return utf8
 138 mkTextEncoding "UTF-16"   = return utf16
 139 mkTextEncoding "UTF-16LE" = return utf16le
 140 mkTextEncoding "UTF-16BE" = return utf16be
 141 mkTextEncoding "UTF-32"   = return utf32
 142 mkTextEncoding "UTF-32LE" = return utf32le
 143 mkTextEncoding "UTF-32BE" = return utf32be
 144 mkTextEncoding ('C':'P':n)
 145     | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
 146 mkTextEncoding e = ioException
 147      (IOError Nothing NoSuchThing "mkTextEncoding"
 148           ("unknown encoding:" ++ e)  Nothing Nothing)
 149 #endif
 150
 151 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
 152 latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
 153 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
 154
 155 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
 156 latin1_decode = Latin1.latin1_decode
 157 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode