GHC/IO/Encoding/Failure.hs

   1 {-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
   2 -----------------------------------------------------------------------------
   3 -- |
   4 -- Module      :  GHC.IO.Encoding.Failure
   5 -- Copyright   :  (c) The University of Glasgow, 2008-2011
   6 -- License     :  see libraries/base/LICENSE
   7 --
   8 -- Maintainer  :  libraries@haskell.org
   9 -- Stability   :  internal
  10 -- Portability :  non-portable
  11 --
  12 -- Types for specifying how text encoding/decoding fails
  13 --
  14 -----------------------------------------------------------------------------
  15
  16 module GHC.IO.Encoding.Failure (
  17     CodingFailureMode(..), codingFailureModeSuffix,
  18     isSurrogate, recoverDecode, recoverEncode
  19   ) where
  20
  21 import GHC.IO
  22 import GHC.IO.Buffer
  23 import GHC.IO.Exception
  24
  25 import GHC.Base
  26 import GHC.Word
  27 import GHC.Show
  28 import GHC.Num
  29 import GHC.Real ( fromIntegral )
  30
  31 --import System.Posix.Internals
  32
  33 import Data.Maybe
  34
  35 -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
  36 -- how they handle illegal sequences.
  37 data CodingFailureMode = ErrorOnCodingFailure         -- ^ Throw an error when an illegal sequence is encountered
  38                        | IgnoreCodingFailure          -- ^ Attempt to ignore and recover if an illegal sequence is encountered
  39                        | TransliterateCodingFailure   -- ^ Replace with the closest visual match upon an illegal sequence
  40                        | SurrogateEscapeFailure       -- ^ Use the surrogate escape mechanism to attempt to allow illegal sequences to be roundtripped.
  41                        deriving (Show)                -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
  42                                                       -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
  43                                                       -- ASCII characters must be padded to two bytes to retain their meaning.
  44
  45 codingFailureModeSuffix :: CodingFailureMode -> String
  46 codingFailureModeSuffix ErrorOnCodingFailure       = ""
  47 codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
  48 codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
  49 codingFailureModeSuffix SurrogateEscapeFailure     = "//SURROGATE"
  50
  51 -- | In transliterate mode, we use this character when decoding unknown bytes.
  52 --
  53 -- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
  54 unrepresentableChar :: Char
  55 unrepresentableChar = '\xFFFD'
  56
  57 -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
  58 -- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because the
  59 -- 'SurrogateEscapeFailure' mode creates unpaired surrogates to round-trip bytes through our internal
  60 -- UTF-16 encoding.
  61 isSurrogate :: Char -> Bool
  62 isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
  63   where x = ord c
  64
  65 escapeToSurrogateCharacter :: Word8 -> Char
  66 escapeToSurrogateCharacter b
  67   | b < 128   = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
  68   | otherwise = chr (0xDC00 + fromIntegral b)
  69
  70 unescapeSurrogateCharacter :: Char -> Maybe Word8
  71 unescapeSurrogateCharacter c
  72     | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
  73     | otherwise                 = Nothing
  74   where x = ord c
  75
  76 recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
  77 recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
  78                   output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
  79  --puts $ "recoverDecode " ++ show ir
  80  case cfm of
  81   ErrorOnCodingFailure       -> ioe_decodingError
  82   IgnoreCodingFailure        -> return (input { bufL=ir+1 }, output)
  83   TransliterateCodingFailure -> do
  84       ow' <- writeCharBuf oraw ow unrepresentableChar
  85       return (input { bufL=ir+1 }, output { bufR=ow' })
  86   SurrogateEscapeFailure     -> do
  87       b <- readWord8Buf iraw ir
  88       ow' <- writeCharBuf oraw ow (escapeToSurrogateCharacter b)
  89       return (input { bufL=ir+1 }, output { bufR=ow' })
  90
  91 recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
  92 recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
  93                   output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
  94   (c,ir') <- readCharBuf iraw ir
  95   --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
  96   case cfm of
  97     IgnoreCodingFailure        -> return (input { bufL=ir' }, output)
  98     TransliterateCodingFailure -> do
  99         if c == '?'
 100          then return (input { bufL=ir' }, output)
 101          else do
 102           -- XXX: evil hack! To implement transliteration, we just poke an
 103           -- ASCII ? into the input buffer and tell the caller to try and decode
 104           -- again. This is *probably* safe given current uses of TextEncoding.
 105           --
 106           -- The "if" test above ensures we skip if the encoding fails to deal with
 107           -- the ?, though this should never happen in practice as all encodings are
 108           -- in fact capable of reperesenting all ASCII characters.
 109           _ir' <- writeCharBuf iraw ir '?'
 110           return (input, output)
 111
 112         -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
 113         -- encode a simple ASCII value
 114         --writeWord8Buf oraw ow unrepresentableByte
 115         --return (input { bufL=ir' }, output { bufR=ow+1 })
 116     SurrogateEscapeFailure | Just x <- unescapeSurrogateCharacter c -> do
 117         writeWord8Buf oraw ow x
 118         return (input { bufL=ir' }, output { bufR=ow+1 })
 119     _                          -> ioe_encodingError
 120
 121 ioe_decodingError :: IO a
 122 ioe_decodingError = ioException
 123     (IOError Nothing InvalidArgument "recoverDecode"
 124         "invalid byte sequence" Nothing Nothing)
 125
 126 ioe_encodingError :: IO a
 127 ioe_encodingError = ioException
 128     (IOError Nothing InvalidArgument "recoverEncode"
 129         "invalid character" Nothing Nothing)