X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FIO%2FEncoding.hs;h=92ca843d80429c34ddab42567447820a538689fd;hb=HEAD;hp=a1da1b1b21986879f3416d7c624f9a11f2ad6d53;hpb=1bf6eeca9b174c5d9522a551e58daad6895faab7;p=ghc-base.git diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs index a1da1b1..92ca843 100644 --- a/GHC/IO/Encoding.hs +++ b/GHC/IO/Encoding.hs @@ -1,4 +1,6 @@ -{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-} +{-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding @@ -14,32 +16,36 @@ ----------------------------------------------------------------------------- module GHC.IO.Encoding ( - BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, + BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..), latin1, latin1_encode, latin1_decode, utf8, utf8_bom, utf16, utf16le, utf16be, utf32, utf32le, utf32be, - localeEncoding, + localeEncoding, fileSystemEncoding, foreignEncoding, + char8, mkTextEncoding, ) where import GHC.Base --import GHC.IO +import GHC.IO.Exception import GHC.IO.Buffer +import GHC.IO.Encoding.Failure import GHC.IO.Encoding.Types import GHC.Word #if !defined(mingw32_HOST_OS) import qualified GHC.IO.Encoding.Iconv as Iconv +#else +import qualified GHC.IO.Encoding.CodePage as CodePage +import Text.Read (reads) #endif import qualified GHC.IO.Encoding.Latin1 as Latin1 import qualified GHC.IO.Encoding.UTF8 as UTF8 import qualified GHC.IO.Encoding.UTF16 as UTF16 import qualified GHC.IO.Encoding.UTF32 as UTF32 -#if defined(mingw32_HOST_OS) +import Data.List import Data.Maybe -import GHC.IO.Exception -#endif -- ----------------------------------------------------------------------------- @@ -92,39 +98,105 @@ utf32be :: TextEncoding utf32be = UTF32.utf32be -- | The Unicode encoding of the current locale -localeEncoding :: TextEncoding +localeEncoding :: TextEncoding + +-- | The Unicode encoding of the current locale, but allowing arbitrary +-- undecodable bytes to be round-tripped through it. +-- +-- This 'TextEncoding' is used to decode and encode command line arguments +-- and environment variables on non-Windows platforms. +-- +-- On Windows, this encoding *should not* be used if possible because +-- the use of code pages is deprecated: Strings should be retrieved +-- via the "wide" W-family of UTF-16 APIs instead +fileSystemEncoding :: TextEncoding + +-- | The Unicode encoding of the current locale, but where undecodable +-- bytes are replaced with their closest visual match. Used for +-- the 'CString' marshalling functions in "Foreign.C.String" +foreignEncoding :: TextEncoding + #if !defined(mingw32_HOST_OS) localeEncoding = Iconv.localeEncoding +fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure +foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure #else -localeEncoding = Latin1.latin1 +localeEncoding = CodePage.localeEncoding +fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure +foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure #endif +-- | An encoding in which Unicode code points are translated to bytes +-- by taking the code point modulo 256. When decoding, bytes are +-- translated directly into the equivalent code point. +-- +-- This encoding never fails in either direction. However, encoding +-- discards informaiton, so encode followed by decode is not the +-- identity. +char8 :: TextEncoding +char8 = Latin1.latin1 + -- | Look up the named Unicode encoding. May fail with -- -- * 'isDoesNotExistError' if the encoding is unknown -- --- The set of known encodings is system-dependent. +-- The set of known encodings is system-dependent, but includes at least: +-- +-- * @UTF-8@ +-- +-- * @UTF-16@, @UTF-16BE@, @UTF-16LE@ +-- +-- * @UTF-32@, @UTF-32BE@, @UTF-32LE@ +-- +-- On systems using GNU iconv (e.g. Linux), there is additional +-- notation for specifying how illegal characters are handled: +-- +-- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause +-- all illegal sequences on input to be ignored, and on output +-- will drop all code points that have no representation in the +-- target encoding. +-- +-- * a suffix of @\/\/TRANSLIT@ will choose a replacement character +-- for illegal sequences or code points. +-- +-- On Windows, you can access supported code pages with the prefix +-- @CP@; for example, @\"CP1250\"@. -- mkTextEncoding :: String -> IO TextEncoding -#if !defined(mingw32_HOST_OS) -mkTextEncoding = Iconv.mkTextEncoding +mkTextEncoding e = case mb_coding_failure_mode of + Nothing -> unknown_encoding + Just cfm -> case enc of + "UTF-8" -> return $ UTF8.mkUTF8 cfm + "UTF-16" -> return $ UTF16.mkUTF16 cfm + "UTF-16LE" -> return $ UTF16.mkUTF16le cfm + "UTF-16BE" -> return $ UTF16.mkUTF16be cfm + "UTF-32" -> return $ UTF32.mkUTF32 cfm + "UTF-32LE" -> return $ UTF32.mkUTF32le cfm + "UTF-32BE" -> return $ UTF32.mkUTF32be cfm +#if defined(mingw32_HOST_OS) + 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp + _ -> unknown_encoding #else -mkTextEncoding "UTF-8" = return utf8 -mkTextEncoding "UTF-16" = return utf16 -mkTextEncoding "UTF-16LE" = return utf16le -mkTextEncoding "UTF-16BE" = return utf16be -mkTextEncoding "UTF-32" = return utf32 -mkTextEncoding "UTF-32LE" = return utf32le -mkTextEncoding "UTF-32BE" = return utf32be -mkTextEncoding e = ioException - (IOError Nothing NoSuchThing "mkTextEncoding" - ("unknown encoding:" ++ e) Nothing Nothing) + _ -> Iconv.mkIconvEncoding cfm enc #endif + where + -- The only problem with actually documenting //IGNORE and //TRANSLIT as + -- supported suffixes is that they are not necessarily supported with non-GNU iconv + (enc, suffix) = span (/= '/') e + mb_coding_failure_mode = case suffix of + "" -> Just ErrorOnCodingFailure + "//IGNORE" -> Just IgnoreCodingFailure + "//TRANSLIT" -> Just TransliterateCodingFailure + "//ROUNDTRIP" -> Just RoundtripFailure + _ -> Nothing + + unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding" + ("unknown encoding:" ++ e) Nothing Nothing) latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8) -latin1_encode = Latin1.latin1_encode -- unchecked, used for binary +latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer) -latin1_decode = Latin1.latin1_decode +latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode