1 {-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
2 -----------------------------------------------------------------------------
4 -- Module : GHC.IO.Encoding.Failure
5 -- Copyright : (c) The University of Glasgow, 2008-2011
6 -- License : see libraries/base/LICENSE
8 -- Maintainer : libraries@haskell.org
9 -- Stability : internal
10 -- Portability : non-portable
12 -- Types for specifying how text encoding/decoding fails
14 -----------------------------------------------------------------------------
16 module GHC.IO.Encoding.Failure (
17 CodingFailureMode(..), codingFailureModeSuffix,
18 isSurrogate, recoverDecode, recoverEncode
23 import GHC.IO.Exception
29 import GHC.Real ( fromIntegral )
31 --import System.Posix.Internals
35 -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
36 -- how they handle illegal sequences.
37 data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered
38 | IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered
39 | TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence
40 | SurrogateEscapeFailure -- ^ Use the surrogate escape mechanism to attempt to allow illegal sequences to be roundtripped.
41 deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
42 -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
43 -- ASCII characters must be padded to two bytes to retain their meaning.
45 codingFailureModeSuffix :: CodingFailureMode -> String
46 codingFailureModeSuffix ErrorOnCodingFailure = ""
47 codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
48 codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
49 codingFailureModeSuffix SurrogateEscapeFailure = "//SURROGATE"
51 -- | In transliterate mode, we use this character when decoding unknown bytes.
53 -- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
54 unrepresentableChar :: Char
55 unrepresentableChar = '\xFFFD'
57 -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
58 -- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because the
59 -- 'SurrogateEscapeFailure' mode creates unpaired surrogates to round-trip bytes through our internal
61 isSurrogate :: Char -> Bool
62 isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
65 escapeToSurrogateCharacter :: Word8 -> Char
66 escapeToSurrogateCharacter b
67 | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
68 | otherwise = chr (0xDC00 + fromIntegral b)
70 unescapeSurrogateCharacter :: Char -> Maybe Word8
71 unescapeSurrogateCharacter c
72 | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
76 recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
77 recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
78 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
79 --puts $ "recoverDecode " ++ show ir
81 ErrorOnCodingFailure -> ioe_decodingError
82 IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
83 TransliterateCodingFailure -> do
84 ow' <- writeCharBuf oraw ow unrepresentableChar
85 return (input { bufL=ir+1 }, output { bufR=ow' })
86 SurrogateEscapeFailure -> do
87 b <- readWord8Buf iraw ir
88 ow' <- writeCharBuf oraw ow (escapeToSurrogateCharacter b)
89 return (input { bufL=ir+1 }, output { bufR=ow' })
91 recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
92 recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
93 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
94 (c,ir') <- readCharBuf iraw ir
95 --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
97 IgnoreCodingFailure -> return (input { bufL=ir' }, output)
98 TransliterateCodingFailure -> do
100 then return (input { bufL=ir' }, output)
102 -- XXX: evil hack! To implement transliteration, we just poke an
103 -- ASCII ? into the input buffer and tell the caller to try and decode
104 -- again. This is *probably* safe given current uses of TextEncoding.
106 -- The "if" test above ensures we skip if the encoding fails to deal with
107 -- the ?, though this should never happen in practice as all encodings are
108 -- in fact capable of reperesenting all ASCII characters.
109 _ir' <- writeCharBuf iraw ir '?'
110 return (input, output)
112 -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
113 -- encode a simple ASCII value
114 --writeWord8Buf oraw ow unrepresentableByte
115 --return (input { bufL=ir' }, output { bufR=ow+1 })
116 SurrogateEscapeFailure | Just x <- unescapeSurrogateCharacter c -> do
117 writeWord8Buf oraw ow x
118 return (input { bufL=ir' }, output { bufR=ow+1 })
119 _ -> ioe_encodingError
121 ioe_decodingError :: IO a
122 ioe_decodingError = ioException
123 (IOError Nothing InvalidArgument "recoverDecode"
124 "invalid byte sequence" Nothing Nothing)
126 ioe_encodingError :: IO a
127 ioe_encodingError = ioException
128 (IOError Nothing InvalidArgument "recoverEncode"
129 "invalid character" Nothing Nothing)