X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FIO%2FEncoding.hs;h=5d8ecb4c7062894a5f0b11b6abc6062dc395f061;hb=54a0b4928967a350be687e17ff3576f2af6e21fb;hp=bb976e3cce6cc398751ed5bf406e32e4b8790a59;hpb=8afc9fecd586d3c4f7ef9c69fb1686a79e5f441d;p=ghc-base.git diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs index bb976e3..5d8ecb4 100644 --- a/GHC/IO/Encoding.hs +++ b/GHC/IO/Encoding.hs @@ -1,4 +1,6 @@ -{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-} +{-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding @@ -16,7 +18,7 @@ module GHC.IO.Encoding ( BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, latin1, latin1_encode, latin1_decode, - utf8, + utf8, utf8_bom, utf16, utf16le, utf16be, utf32, utf32le, utf32be, localeEncoding, @@ -30,6 +32,9 @@ import GHC.IO.Encoding.Types import GHC.Word #if !defined(mingw32_HOST_OS) import qualified GHC.IO.Encoding.Iconv as Iconv +#else +import qualified GHC.IO.Encoding.CodePage as CodePage +import Text.Read (reads) #endif import qualified GHC.IO.Encoding.Latin1 as Latin1 import qualified GHC.IO.Encoding.UTF8 as UTF8 @@ -43,45 +48,88 @@ import GHC.IO.Exception -- ----------------------------------------------------------------------------- -latin1, utf8, utf16, utf16le, utf16be, utf32, utf32le, utf32be, localeEncoding - :: TextEncoding - -- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes -- directly to the first 256 Unicode code points, and is thus not a --- complete Unicode encoding. +-- complete Unicode encoding. An attempt to write a character greater than +-- '\255' to a 'Handle' using the 'latin1' encoding will result in an error. +latin1 :: TextEncoding latin1 = Latin1.latin1_checked --- | The UTF-8 unicode encoding +-- | The UTF-8 Unicode encoding +utf8 :: TextEncoding utf8 = UTF8.utf8 --- | The UTF-16 unicode encoding (a byte-order-mark should be used to +-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte +-- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8', +-- except that on input, the BOM sequence is ignored at the beginning +-- of the stream, and on output, the BOM sequence is prepended. +-- +-- The byte-order-mark is strictly unnecessary in UTF-8, but is +-- sometimes used to identify the encoding of a file. +-- +utf8_bom :: TextEncoding +utf8_bom = UTF8.utf8_bom + +-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to -- indicate endianness). +utf16 :: TextEncoding utf16 = UTF16.utf16 --- | The UTF-16 unicode encoding (litte-endian) +-- | The UTF-16 Unicode encoding (litte-endian) +utf16le :: TextEncoding utf16le = UTF16.utf16le --- | The UTF-16 unicode encoding (big-endian) +-- | The UTF-16 Unicode encoding (big-endian) +utf16be :: TextEncoding utf16be = UTF16.utf16be --- | The UTF-32 unicode encoding (a byte-order-mark should be used to +-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to -- indicate endianness). +utf32 :: TextEncoding utf32 = UTF32.utf32 --- | The UTF-32 unicode encoding (litte-endian) +-- | The UTF-32 Unicode encoding (litte-endian) +utf32le :: TextEncoding utf32le = UTF32.utf32le --- | The UTF-32 unicode encoding (big-endian) +-- | The UTF-32 Unicode encoding (big-endian) +utf32be :: TextEncoding utf32be = UTF32.utf32be --- | The text encoding of the current locale +-- | The Unicode encoding of the current locale +localeEncoding :: TextEncoding #if !defined(mingw32_HOST_OS) localeEncoding = Iconv.localeEncoding #else -localeEncoding = Latin1.latin1 +localeEncoding = CodePage.localeEncoding #endif --- | Acquire the named text encoding +-- | Look up the named Unicode encoding. May fail with +-- +-- * 'isDoesNotExistError' if the encoding is unknown +-- +-- The set of known encodings is system-dependent, but includes at least: +-- +-- * @UTF-8@ +-- +-- * @UTF-16@, @UTF-16BE@, @UTF-16LE@ +-- +-- * @UTF-32@, @UTF-32BE@, @UTF-32LE@ +-- +-- On systems using GNU iconv (e.g. Linux), there is additional +-- notation for specifying how illegal characters are handled: +-- +-- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause +-- all illegal sequences on input to be ignored, and on output +-- will drop all code points that have no representation in the +-- target encoding. +-- +-- * a suffix of @\/\/TRANSLIT@ will choose a replacement character +-- for illegal sequences or code points. +-- +-- On Windows, you can access supported code pages with the prefix +-- @CP@; for example, @\"CP1250\"@. +-- mkTextEncoding :: String -> IO TextEncoding #if !defined(mingw32_HOST_OS) mkTextEncoding = Iconv.mkTextEncoding @@ -93,8 +141,10 @@ mkTextEncoding "UTF-16BE" = return utf16be mkTextEncoding "UTF-32" = return utf32 mkTextEncoding "UTF-32LE" = return utf32le mkTextEncoding "UTF-32BE" = return utf32be +mkTextEncoding ('C':'P':n) + | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp mkTextEncoding e = ioException - (IOError Nothing InvalidArgument "mkTextEncoding" + (IOError Nothing NoSuchThing "mkTextEncoding" ("unknown encoding:" ++ e) Nothing Nothing) #endif