Use Unicode private-use characters for roundtripping

[ghc-base.git] / GHC / IO / Encoding.hs
diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs

index bb976e3..505824e 100644 (file)
--- a/GHC/IO/Encoding.hs
+++ b/GHC/IO/Encoding.hs
@@ -1,4 +1,6 @@
-{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
+{-# LANGUAGE CPP, NoImplicitPrelude, PatternGuards #-}
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
  -----------------------------------------------------------------------------
  -- |
  -- Module      :  GHC.IO.Encoding
@@ -14,94 +16,176 @@
  -----------------------------------------------------------------------------
  
  module GHC.IO.Encoding (
-  BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
+  BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
    latin1, latin1_encode, latin1_decode,
-  utf8, 
+  utf8, utf8_bom,
    utf16, utf16le, utf16be,
    utf32, utf32le, utf32be, 
-  localeEncoding,
+  localeEncoding, fileSystemEncoding, foreignEncoding,
    mkTextEncoding,
    ) where
  
  import GHC.Base
  --import GHC.IO
+import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.Word
  #if !defined(mingw32_HOST_OS)
  import qualified GHC.IO.Encoding.Iconv  as Iconv
+#else
+import qualified GHC.IO.Encoding.CodePage as CodePage
+import Text.Read (reads)
  #endif
  import qualified GHC.IO.Encoding.Latin1 as Latin1
  import qualified GHC.IO.Encoding.UTF8   as UTF8
  import qualified GHC.IO.Encoding.UTF16  as UTF16
  import qualified GHC.IO.Encoding.UTF32  as UTF32
  
-#if defined(mingw32_HOST_OS)
+import Data.List
  import Data.Maybe
-import GHC.IO.Exception
-#endif
  
  -- -----------------------------------------------------------------------------
  
-latin1, utf8, utf16, utf16le, utf16be, utf32, utf32le, utf32be, localeEncoding
-  :: TextEncoding
-
  -- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
  -- directly to the first 256 Unicode code points, and is thus not a
--- complete Unicode encoding.
+-- complete Unicode encoding.  An attempt to write a character greater than
+-- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
+latin1  :: TextEncoding
  latin1 = Latin1.latin1_checked
  
--- | The UTF-8 unicode encoding
+-- | The UTF-8 Unicode encoding
+utf8  :: TextEncoding
  utf8 = UTF8.utf8
  
--- | The UTF-16 unicode encoding (a byte-order-mark should be used to
+-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
+-- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
+-- except that on input, the BOM sequence is ignored at the beginning
+-- of the stream, and on output, the BOM sequence is prepended.
+--
+-- The byte-order-mark is strictly unnecessary in UTF-8, but is
+-- sometimes used to identify the encoding of a file.
+--
+utf8_bom  :: TextEncoding
+utf8_bom = UTF8.utf8_bom
+
+-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  -- indicate endianness).
+utf16  :: TextEncoding
  utf16 = UTF16.utf16
  
--- | The UTF-16 unicode encoding (litte-endian)
+-- | The UTF-16 Unicode encoding (litte-endian)
+utf16le  :: TextEncoding
  utf16le = UTF16.utf16le
  
--- | The UTF-16 unicode encoding (big-endian)
+-- | The UTF-16 Unicode encoding (big-endian)
+utf16be  :: TextEncoding
  utf16be = UTF16.utf16be
  
--- | The UTF-32 unicode encoding (a byte-order-mark should be used to
+-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
  -- indicate endianness).
+utf32  :: TextEncoding
  utf32 = UTF32.utf32
  
--- | The UTF-32 unicode encoding (litte-endian)
+-- | The UTF-32 Unicode encoding (litte-endian)
+utf32le  :: TextEncoding
  utf32le = UTF32.utf32le
  
--- | The UTF-32 unicode encoding (big-endian)
+-- | The UTF-32 Unicode encoding (big-endian)
+utf32be  :: TextEncoding
  utf32be = UTF32.utf32be
  
--- | The text encoding of the current locale
+-- | The Unicode encoding of the current locale
+localeEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but allowing arbitrary
+-- undecodable bytes to be round-tripped through it.
+--
+-- This 'TextEncoding' is used to decode and encode command line arguments
+-- and environment variables on non-Windows platforms.
+--
+-- On Windows, this encoding *should not* be used if possible because
+-- the use of code pages is deprecated: Strings should be retrieved
+-- via the "wide" W-family of UTF-16 APIs instead
+fileSystemEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but where undecodable
+-- bytes are replaced with their closest visual match. Used for
+-- the 'CString' marshalling functions in "Foreign.C.String"
+foreignEncoding :: TextEncoding
+
  #if !defined(mingw32_HOST_OS)
  localeEncoding = Iconv.localeEncoding
+fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
+foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
  #else
-localeEncoding = Latin1.latin1
+localeEncoding = CodePage.localeEncoding
+fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
+foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
  #endif
  
--- | Acquire the named text encoding
+-- | Look up the named Unicode encoding.  May fail with 
+--
+--  * 'isDoesNotExistError' if the encoding is unknown
+--
+-- The set of known encodings is system-dependent, but includes at least:
+--
+--  * @UTF-8@
+--
+--  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
+--
+--  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
+--
+-- On systems using GNU iconv (e.g. Linux), there is additional
+-- notation for specifying how illegal characters are handled:
+--
+--  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause 
+--    all illegal sequences on input to be ignored, and on output
+--    will drop all code points that have no representation in the
+--    target encoding.
+--
+--  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
+--    for illegal sequences or code points.
+--
+-- On Windows, you can access supported code pages with the prefix
+-- @CP@; for example, @\"CP1250\"@.
+--
  mkTextEncoding :: String -> IO TextEncoding
-#if !defined(mingw32_HOST_OS)
-mkTextEncoding = Iconv.mkTextEncoding
+mkTextEncoding e = case mb_coding_failure_mode of
+  Nothing -> unknown_encoding
+  Just cfm -> case enc of
+    "UTF-8"    -> return $ UTF8.mkUTF8 cfm
+    "UTF-16"   -> return $ UTF16.mkUTF16 cfm
+    "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
+    "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
+    "UTF-32"   -> return $ UTF32.mkUTF32 cfm
+    "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
+    "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
+#if defined(mingw32_HOST_OS)
+    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
+    _ -> unknown_encoding
  #else
-mkTextEncoding "UTF-8"    = return utf8
-mkTextEncoding "UTF-16"   = return utf16
-mkTextEncoding "UTF-16LE" = return utf16le
-mkTextEncoding "UTF-16BE" = return utf16be
-mkTextEncoding "UTF-32"   = return utf32
-mkTextEncoding "UTF-32LE" = return utf32le
-mkTextEncoding "UTF-32BE" = return utf32be
-mkTextEncoding e = ioException
-     (IOError Nothing InvalidArgument "mkTextEncoding"
-          ("unknown encoding:" ++ e)  Nothing Nothing)
+    _ -> Iconv.mkIconvEncoding cfm enc
  #endif
+  where
+    -- The only problem with actually documenting //IGNORE and //TRANSLIT as
+    -- supported suffixes is that they are not necessarily supported with non-GNU iconv
+    (enc, suffix) = span (/= '/') e
+    mb_coding_failure_mode = case suffix of
+        ""            -> Just ErrorOnCodingFailure
+        "//IGNORE"    -> Just IgnoreCodingFailure
+        "//TRANSLIT"  -> Just TransliterateCodingFailure
+        "//ROUNDTRIP" -> Just RoundtripFailure
+        _             -> Nothing
+    
+    unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
+                                            ("unknown encoding:" ++ e)  Nothing Nothing)
  
  latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
-latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
+latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for binary
  --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
  
  latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
-latin1_decode = Latin1.latin1_decode
+latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
  --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode