Use Unicode private-use characters for roundtripping

author Max Bolingbroke <batterseapower@hotmail.com>

Wed, 18 May 2011 08:41:49 +0000 (09:41 +0100)

committer Max Bolingbroke <batterseapower@hotmail.com>

Wed, 18 May 2011 08:41:49 +0000 (09:41 +0100)
author Max Bolingbroke <batterseapower@hotmail.com>
Wed, 18 May 2011 08:41:49 +0000 (09:41 +0100)
committer Max Bolingbroke <batterseapower@hotmail.com>
Wed, 18 May 2011 08:41:49 +0000 (09:41 +0100)
diff --git a/GHC/Foreign.hs b/GHC/Foreign.hs

index b4c760c..109fa83 100644 (file)
--- a/GHC/Foreign.hs
+++ b/GHC/Foreign.hs
@@ -62,6 +62,7 @@ import GHC.Base
  import GHC.IO
  import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
  import GHC.IO.Encoding.Types
  
  
@@ -171,7 +172,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
              if isEmptyBuffer from'
               then
                -- No input remaining: @why@ will be InputUnderflow, but we don't care
-              withBuffer to' $ peekArray (bufferElems to')
+              fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
               else do
                -- Input remaining: what went wrong?
                putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
@@ -181,7 +182,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
                putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
                putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
                to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
-              fmap (to_chars++) $ go (iteration + 1) from''
+              fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
  
        go (0 :: Int) from0
  
@@ -192,7 +193,7 @@ withEncodedCString :: TextEncoding         -- ^ Encoding of CString to create
                     -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
                     -> IO a
  withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
-  = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+  = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
        from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
  
        let go iteration to_sz_bytes = do
@@ -212,7 +213,7 @@ newEncodedCString :: TextEncoding  -- ^ Encoding of CString to create
                    -> String        -- ^ String to encode
                    -> IO CStringLen
  newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
-  = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+  = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
        from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
  
        let go iteration to_p to_sz_bytes = do
diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs

index 953fc2e..505824e 100644 (file)
--- a/GHC/IO/Encoding.hs
+++ b/GHC/IO/Encoding.hs
@@ -117,11 +117,11 @@ foreignEncoding :: TextEncoding
  
  #if !defined(mingw32_HOST_OS)
  localeEncoding = Iconv.localeEncoding
-fileSystemEncoding = Iconv.mkLocaleEncoding SurrogateEscapeFailure
+fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
  foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
  #else
  localeEncoding = CodePage.localeEncoding
-fileSystemEncoding = CodePage.mkLocaleEncoding SurrogateEscapeFailure
+fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
  foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
  #endif
  
@@ -176,7 +176,7 @@ mkTextEncoding e = case mb_coding_failure_mode of
          ""            -> Just ErrorOnCodingFailure
          "//IGNORE"    -> Just IgnoreCodingFailure
          "//TRANSLIT"  -> Just TransliterateCodingFailure
-        "//SURROGATE" -> Just SurrogateEscapeFailure
+        "//ROUNDTRIP" -> Just RoundtripFailure
          _             -> Nothing
      
      unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
diff --git a/GHC/IO/Encoding/Failure.hs b/GHC/IO/Encoding/Failure.hs

index 1d9dcb0..9c0e6d9 100644 (file)
--- a/GHC/IO/Encoding/Failure.hs
+++ b/GHC/IO/Encoding/Failure.hs
@@ -15,7 +15,9 @@
  
  module GHC.IO.Encoding.Failure (
      CodingFailureMode(..), codingFailureModeSuffix,
-    isSurrogate, recoverDecode, recoverEncode
+    isSurrogate,
+    surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
+    recoverDecode, recoverEncode
    ) where
  
  import GHC.IO
@@ -37,16 +39,34 @@ import Data.Maybe
  data CodingFailureMode = ErrorOnCodingFailure         -- ^ Throw an error when an illegal sequence is encountered
                         | IgnoreCodingFailure          -- ^ Attempt to ignore and recover if an illegal sequence is encountered
                         | TransliterateCodingFailure   -- ^ Replace with the closest visual match upon an illegal sequence
-                       | SurrogateEscapeFailure       -- ^ Use the surrogate escape mechanism to attempt to allow illegal sequences to be roundtripped.
+                       | RoundtripFailure             -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped.
                         deriving (Show)                -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
                                                        -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
                                                        -- ASCII characters must be padded to two bytes to retain their meaning.
  
+-- Note [Roundtripping]
+-- ~~~~~~~~~~~~~~~~~~~~
+--
+-- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints
+-- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use
+-- the range of private-use characters from 0xF1E00 to 0xF1EFF.
+--
+-- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when
+-- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a
+-- chance to replace it with the byte we originally escaped.
+--
+-- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace
+-- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString)
+-- we have to do the inverse process.
+--
+-- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them.
+-- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
+
  codingFailureModeSuffix :: CodingFailureMode -> String
  codingFailureModeSuffix ErrorOnCodingFailure       = ""
  codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
  codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
-codingFailureModeSuffix SurrogateEscapeFailure     = "//SURROGATE"
+codingFailureModeSuffix RoundtripFailure           = "//ROUNDTRIP"
  
  -- | In transliterate mode, we use this character when decoding unknown bytes.
  --
@@ -55,20 +75,44 @@ unrepresentableChar :: Char
  unrepresentableChar = '\xFFFD'
  
  -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
--- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because the
--- 'SurrogateEscapeFailure' mode creates unpaired surrogates to round-trip bytes through our internal
--- UTF-16 encoding.
+-- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't
+-- give valid Unicode.
+--
+-- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's
+-- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding.
  isSurrogate :: Char -> Bool
  isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
    where x = ord c
  
-escapeToSurrogateCharacter :: Word8 -> Char
-escapeToSurrogateCharacter b
+-- | We use some private-use characters for roundtripping unknown bytes through a String
+isRoundtripEscapeChar :: Char -> Bool
+isRoundtripEscapeChar c = 0xF1E00 <= x && x < 0xF1F00
+  where x = ord c
+
+-- | We use some surrogate characters for roundtripping unknown bytes through a String
+isRoundtripEscapeSurrogateChar :: Char -> Bool
+isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
+  where x = ord c
+
+-- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem)
+surrogatifyRoundtripCharacter :: Char -> Char
+surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xF1E00 + 0xDC00)
+                                | otherwise               = c
+
+-- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings)
+desurrogatifyRoundtripCharacter :: Char -> Char
+desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xF1E00)
+                                  | otherwise                        = c
+
+-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
+escapeToRoundtripCharacterSurrogate :: Word8 -> Char
+escapeToRoundtripCharacterSurrogate b
    | b < 128   = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
    | otherwise = chr (0xDC00 + fromIntegral b)
  
-unescapeSurrogateCharacter :: Char -> Maybe Word8
-unescapeSurrogateCharacter c
+-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
+unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
+unescapeRoundtripCharacterSurrogate c
      | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
      | otherwise                 = Nothing
    where x = ord c
@@ -83,9 +127,9 @@ recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
    TransliterateCodingFailure -> do
        ow' <- writeCharBuf oraw ow unrepresentableChar
        return (input { bufL=ir+1 }, output { bufR=ow' })
-  SurrogateEscapeFailure     -> do
+  RoundtripFailure           -> do
        b <- readWord8Buf iraw ir
-      ow' <- writeCharBuf oraw ow (escapeToSurrogateCharacter b)
+      ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
        return (input { bufL=ir+1 }, output { bufR=ow' })
  
  recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
@@ -113,7 +157,7 @@ recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
          -- encode a simple ASCII value
          --writeWord8Buf oraw ow unrepresentableByte
          --return (input { bufL=ir' }, output { bufR=ow+1 })
-    SurrogateEscapeFailure | Just x <- unescapeSurrogateCharacter c -> do
+    RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
          writeWord8Buf oraw ow x
          return (input { bufL=ir' }, output { bufR=ow+1 })
      _                          -> ioe_encodingError
diff --git a/GHC/IO/Encoding/Types.hs b/GHC/IO/Encoding/Types.hs

index 706f7b5..ebce578 100644 (file)
--- a/GHC/IO/Encoding/Types.hs
+++ b/GHC/IO/Encoding/Types.hs
@@ -43,6 +43,14 @@ data BufferCodec from to state = BufferCodec {
     -- The fact that as many elements as possible are translated is used by the IO
     -- library in order to report translation errors at the point they
     -- actually occur, rather than when the buffer is translated.
+   --
+   -- To allow us to use iconv as a BufferCode efficiently, character buffers are
+   -- defined to contain lone surrogates instead of those private use characters that
+   -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
+   -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
+   -- respectively.
+   --
+   -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
    
    recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
     -- ^ The @recover@ function is used to continue decoding
@@ -56,6 +64,9 @@ data BufferCodec from to state = BufferCodec {
     -- of free space.
     --
     -- @recover@ may raise an exception rather than skipping anything.
+   --
+   -- Currently, some implementations of @recover@ may mutate the input buffer.
+   -- In particular, this feature is used to implement transliteration.
    
    close  :: IO (),
     -- ^ Resources associated with the encoding may now be released.
diff --git a/GHC/IO/Handle/Text.hs b/GHC/IO/Handle/Text.hs

index 160e9d7..0d0e05b 100644 (file)
--- a/GHC/IO/Handle/Text.hs
+++ b/GHC/IO/Handle/Text.hs
@@ -38,6 +38,7 @@ import GHC.IO.FD
  import GHC.IO.Buffer
  import qualified GHC.IO.BufferedIO as Buffered
  import GHC.IO.Exception
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
  import GHC.Exception
  import GHC.IO.Handle.Types
  import GHC.IO.Handle.Internals
@@ -279,10 +280,10 @@ unpack !buf !r !w acc0
                   else do c1 <- peekElemOff pbuf (i-1)
                           let c = (fromIntegral c1 - 0xd800) * 0x400 +
                                   (fromIntegral c2 - 0xdc00) + 0x10000
-                         unpackRB (unsafeChr c : acc) (i-2)
+                         unpackRB (desurrogatifyRoundtripCharacter (unsafeChr c) : acc) (i-2)
  #else
                c <- peekElemOff pbuf i
-              unpackRB (c:acc) (i-1)
+              unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
  #endif
       in
       unpackRB acc0 (w-1)
@@ -305,7 +306,7 @@ unpack_nl !buf !r !w acc0
                              then unpackRB ('\n':acc) (i-2)
                              else unpackRB ('\n':acc) (i-1)
                   else do
-                         unpackRB (c:acc) (i-1)
+                         unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
       in do
       c <- peekElemOff pbuf (w-1)
       if (c == '\r')
@@ -585,7 +586,7 @@ writeBlocks hdl line_buffered add_nl nl
             else do
                 shoveString n' cs rest
       | otherwise = do
-        n' <- writeCharBuf raw n c
+        n' <- writeCharBuf raw n (surrogatifyRoundtripCharacter c)
          shoveString n' cs rest
    in
    shoveString 0 s (if add_nl then "\n" else "")
author	Max Bolingbroke <batterseapower@hotmail.com>
	Wed, 18 May 2011 08:41:49 +0000 (09:41 +0100)
committer	Max Bolingbroke <batterseapower@hotmail.com>
	Wed, 18 May 2011 08:41:49 +0000 (09:41 +0100)
GHC/Foreign.hs		patch \| blob \| history
GHC/IO/Encoding.hs		patch \| blob \| history
GHC/IO/Encoding/Failure.hs		patch \| blob \| history
GHC/IO/Encoding/Types.hs		patch \| blob \| history
GHC/IO/Handle/Text.hs		patch \| blob \| history