GHC/IO/Encoding/Failure.hs

   1 {-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
   2 -----------------------------------------------------------------------------
   3 -- |
   4 -- Module      :  GHC.IO.Encoding.Failure
   5 -- Copyright   :  (c) The University of Glasgow, 2008-2011
   6 -- License     :  see libraries/base/LICENSE
   7 --
   8 -- Maintainer  :  libraries@haskell.org
   9 -- Stability   :  internal
  10 -- Portability :  non-portable
  11 --
  12 -- Types for specifying how text encoding/decoding fails
  13 --
  14 -----------------------------------------------------------------------------
  15
  16 module GHC.IO.Encoding.Failure (
  17     CodingFailureMode(..), codingFailureModeSuffix,
  18     isSurrogate,
  19     surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
  20     recoverDecode, recoverEncode
  21   ) where
  22
  23 import GHC.IO
  24 import GHC.IO.Buffer
  25 import GHC.IO.Exception
  26
  27 import GHC.Base
  28 import GHC.Word
  29 import GHC.Show
  30 import GHC.Num
  31 import GHC.Real ( fromIntegral )
  32
  33 --import System.Posix.Internals
  34
  35 import Data.Maybe
  36
  37 -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
  38 -- how they handle illegal sequences.
  39 data CodingFailureMode = ErrorOnCodingFailure         -- ^ Throw an error when an illegal sequence is encountered
  40                        | IgnoreCodingFailure          -- ^ Attempt to ignore and recover if an illegal sequence is encountered
  41                        | TransliterateCodingFailure   -- ^ Replace with the closest visual match upon an illegal sequence
  42                        | RoundtripFailure             -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped.
  43                        deriving (Show)                -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
  44                                                       -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
  45                                                       -- ASCII characters must be padded to two bytes to retain their meaning.
  46
  47 -- Note [Roundtripping]
  48 -- ~~~~~~~~~~~~~~~~~~~~
  49 --
  50 -- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints
  51 -- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use
  52 -- the range of private-use characters from 0xEF80 to 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery.
  53 --
  54 -- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when
  55 -- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a
  56 -- chance to replace it with the byte we originally escaped.
  57 --
  58 -- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace
  59 -- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString)
  60 -- we have to do the inverse process.
  61 --
  62 -- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them.
  63 -- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
  64
  65 codingFailureModeSuffix :: CodingFailureMode -> String
  66 codingFailureModeSuffix ErrorOnCodingFailure       = ""
  67 codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
  68 codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
  69 codingFailureModeSuffix RoundtripFailure           = "//ROUNDTRIP"
  70
  71 -- | In transliterate mode, we use this character when decoding unknown bytes.
  72 --
  73 -- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
  74 unrepresentableChar :: Char
  75 unrepresentableChar = '\xFFFD'
  76
  77 -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
  78 -- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't
  79 -- give valid Unicode.
  80 --
  81 -- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's
  82 -- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding.
  83 isSurrogate :: Char -> Bool
  84 isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
  85   where x = ord c
  86
  87 -- | We use some private-use characters for roundtripping unknown bytes through a String
  88 isRoundtripEscapeChar :: Char -> Bool
  89 isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000
  90   where x = ord c
  91
  92 -- | We use some surrogate characters for roundtripping unknown bytes through a String
  93 isRoundtripEscapeSurrogateChar :: Char -> Bool
  94 isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
  95   where x = ord c
  96
  97 -- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem)
  98 surrogatifyRoundtripCharacter :: Char -> Char
  99 surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xEF00 + 0xDC00)
 100                                 | otherwise               = c
 101
 102 -- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings)
 103 desurrogatifyRoundtripCharacter :: Char -> Char
 104 desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xEF00)
 105                                   | otherwise                        = c
 106
 107 -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
 108 escapeToRoundtripCharacterSurrogate :: Word8 -> Char
 109 escapeToRoundtripCharacterSurrogate b
 110   | b < 128   = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
 111   | otherwise = chr (0xDC00 + fromIntegral b)
 112
 113 -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
 114 unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
 115 unescapeRoundtripCharacterSurrogate c
 116     | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
 117     | otherwise                 = Nothing
 118   where x = ord c
 119
 120 recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
 121 recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
 122                   output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
 123  --puts $ "recoverDecode " ++ show ir
 124  case cfm of
 125   ErrorOnCodingFailure       -> ioe_decodingError
 126   IgnoreCodingFailure        -> return (input { bufL=ir+1 }, output)
 127   TransliterateCodingFailure -> do
 128       ow' <- writeCharBuf oraw ow unrepresentableChar
 129       return (input { bufL=ir+1 }, output { bufR=ow' })
 130   RoundtripFailure           -> do
 131       b <- readWord8Buf iraw ir
 132       ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
 133       return (input { bufL=ir+1 }, output { bufR=ow' })
 134
 135 recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
 136 recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
 137                   output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
 138   (c,ir') <- readCharBuf iraw ir
 139   --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
 140   case cfm of
 141     IgnoreCodingFailure        -> return (input { bufL=ir' }, output)
 142     TransliterateCodingFailure -> do
 143         if c == '?'
 144          then return (input { bufL=ir' }, output)
 145          else do
 146           -- XXX: evil hack! To implement transliteration, we just poke an
 147           -- ASCII ? into the input buffer and tell the caller to try and decode
 148           -- again. This is *probably* safe given current uses of TextEncoding.
 149           --
 150           -- The "if" test above ensures we skip if the encoding fails to deal with
 151           -- the ?, though this should never happen in practice as all encodings are
 152           -- in fact capable of reperesenting all ASCII characters.
 153           _ir' <- writeCharBuf iraw ir '?'
 154           return (input, output)
 155
 156         -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
 157         -- encode a simple ASCII value
 158         --writeWord8Buf oraw ow unrepresentableByte
 159         --return (input { bufL=ir' }, output { bufR=ow+1 })
 160     RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
 161         writeWord8Buf oraw ow x
 162         return (input { bufL=ir' }, output { bufR=ow+1 })
 163     _                          -> ioe_encodingError
 164
 165 ioe_decodingError :: IO a
 166 ioe_decodingError = ioException
 167     (IOError Nothing InvalidArgument "recoverDecode"
 168         "invalid byte sequence" Nothing Nothing)
 169
 170 ioe_encodingError :: IO a
 171 ioe_encodingError = ioException
 172     (IOError Nothing InvalidArgument "recoverEncode"
 173         "invalid character" Nothing Nothing)