import GHC.IO
import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
import GHC.IO.Encoding.Types
if isEmptyBuffer from'
then
-- No input remaining: @why@ will be InputUnderflow, but we don't care
- withBuffer to' $ peekArray (bufferElems to')
+ fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
else do
-- Input remaining: what went wrong?
putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
- fmap (to_chars++) $ go (iteration + 1) from''
+ fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
go (0 :: Int) from0
-> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
-> IO a
withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
- = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+ = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
let go iteration to_sz_bytes = do
-> String -- ^ String to encode
-> IO CStringLen
newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
- = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+ = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
let go iteration to_p to_sz_bytes = do
module GHC.IO.Encoding.Failure (
CodingFailureMode(..), codingFailureModeSuffix,
- isSurrogate, recoverDecode, recoverEncode
+ isSurrogate,
+ surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
+ recoverDecode, recoverEncode
) where
import GHC.IO
data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered
| IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered
| TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence
- | SurrogateEscapeFailure -- ^ Use the surrogate escape mechanism to attempt to allow illegal sequences to be roundtripped.
+ | RoundtripFailure -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped.
deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
-- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
-- ASCII characters must be padded to two bytes to retain their meaning.
+-- Note [Roundtripping]
+-- ~~~~~~~~~~~~~~~~~~~~
+--
+-- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints
+-- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use
+-- the range of private-use characters from 0xF1E00 to 0xF1EFF.
+--
+-- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when
+-- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a
+-- chance to replace it with the byte we originally escaped.
+--
+-- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace
+-- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString)
+-- we have to do the inverse process.
+--
+-- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them.
+-- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
+
codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix ErrorOnCodingFailure = ""
codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
-codingFailureModeSuffix SurrogateEscapeFailure = "//SURROGATE"
+codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP"
-- | In transliterate mode, we use this character when decoding unknown bytes.
--
unrepresentableChar = '\xFFFD'
-- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
--- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because the
--- 'SurrogateEscapeFailure' mode creates unpaired surrogates to round-trip bytes through our internal
--- UTF-16 encoding.
+-- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't
+-- give valid Unicode.
+--
+-- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's
+-- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding.
isSurrogate :: Char -> Bool
isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
where x = ord c
-escapeToSurrogateCharacter :: Word8 -> Char
-escapeToSurrogateCharacter b
+-- | We use some private-use characters for roundtripping unknown bytes through a String
+isRoundtripEscapeChar :: Char -> Bool
+isRoundtripEscapeChar c = 0xF1E00 <= x && x < 0xF1F00
+ where x = ord c
+
+-- | We use some surrogate characters for roundtripping unknown bytes through a String
+isRoundtripEscapeSurrogateChar :: Char -> Bool
+isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
+ where x = ord c
+
+-- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem)
+surrogatifyRoundtripCharacter :: Char -> Char
+surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xF1E00 + 0xDC00)
+ | otherwise = c
+
+-- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings)
+desurrogatifyRoundtripCharacter :: Char -> Char
+desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xF1E00)
+ | otherwise = c
+
+-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
+escapeToRoundtripCharacterSurrogate :: Word8 -> Char
+escapeToRoundtripCharacterSurrogate b
| b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
| otherwise = chr (0xDC00 + fromIntegral b)
-unescapeSurrogateCharacter :: Char -> Maybe Word8
-unescapeSurrogateCharacter c
+-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
+unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
+unescapeRoundtripCharacterSurrogate c
| 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
| otherwise = Nothing
where x = ord c
TransliterateCodingFailure -> do
ow' <- writeCharBuf oraw ow unrepresentableChar
return (input { bufL=ir+1 }, output { bufR=ow' })
- SurrogateEscapeFailure -> do
+ RoundtripFailure -> do
b <- readWord8Buf iraw ir
- ow' <- writeCharBuf oraw ow (escapeToSurrogateCharacter b)
+ ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
return (input { bufL=ir+1 }, output { bufR=ow' })
recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
-- encode a simple ASCII value
--writeWord8Buf oraw ow unrepresentableByte
--return (input { bufL=ir' }, output { bufR=ow+1 })
- SurrogateEscapeFailure | Just x <- unescapeSurrogateCharacter c -> do
+ RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
writeWord8Buf oraw ow x
return (input { bufL=ir' }, output { bufR=ow+1 })
_ -> ioe_encodingError
-- The fact that as many elements as possible are translated is used by the IO
-- library in order to report translation errors at the point they
-- actually occur, rather than when the buffer is translated.
+ --
+ -- To allow us to use iconv as a BufferCode efficiently, character buffers are
+ -- defined to contain lone surrogates instead of those private use characters that
+ -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
+ -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
+ -- respectively.
+ --
+ -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
-- ^ The @recover@ function is used to continue decoding
-- of free space.
--
-- @recover@ may raise an exception rather than skipping anything.
+ --
+ -- Currently, some implementations of @recover@ may mutate the input buffer.
+ -- In particular, this feature is used to implement transliteration.
close :: IO (),
-- ^ Resources associated with the encoding may now be released.