From: Simon Marlow Date: Sun, 14 Jun 2009 18:53:32 +0000 (+0000) Subject: Save and restore the codec state when re-decoding X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=ccc931d0905f6e0d55cb90b045881d4515112411;p=ghc-base.git Save and restore the codec state when re-decoding We previously had an ugly hack to check for a BOM when re-decoding some binary data in flushCharBuffer. The hack was there essentially because codecs like UTF-16 have a state, and we had not restored it. This patch gives codecs an explicit state, and implemented saving/restoring of the state as necessary. Hence, the hack in flushCharBuffer is replaced by a more general mechanism that works for any codec with state. Unfortunately, iconv doesn't give us a way to save and restore the state, so this is currently only implemented for the built-in codecs. --- diff --git a/GHC/IO/Encoding/Iconv.hs b/GHC/IO/Encoding/Iconv.hs index cca3ebc..237468a 100644 --- a/GHC/IO/Encoding/Iconv.hs +++ b/GHC/IO/Encoding/Iconv.hs @@ -135,7 +135,7 @@ mkTextEncoding charset = do newIConv :: String -> String -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b)) - -> IO (BufferCodec a b) + -> IO (BufferCodec a b ()) newIConv from to fn = withCString from $ \ from_str -> withCString to $ \ to_str -> do @@ -144,7 +144,10 @@ newIConv from to fn = return () return BufferCodec{ encode = fn iconvt, - close = iclose + close = iclose, + -- iconv doesn't supply a way to save/restore the state + getState = return (), + setState = const $ return () } iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem diff --git a/GHC/IO/Encoding/Latin1.hs b/GHC/IO/Encoding/Latin1.hs index 60598f6..504f4dc 100644 --- a/GHC/IO/Encoding/Latin1.hs +++ b/GHC/IO/Encoding/Latin1.hs @@ -42,18 +42,36 @@ latin1 :: TextEncoding latin1 = TextEncoding { mkTextDecoder = latin1_DF, mkTextEncoder = latin1_EF } -latin1_DF :: IO TextDecoder -latin1_DF = return (BufferCodec latin1_decode (return ())) +latin1_DF :: IO (TextDecoder ()) +latin1_DF = + return (BufferCodec { + encode = latin1_decode, + close = return (), + getState = return (), + setState = const $ return () + }) -latin1_EF :: IO TextEncoder -latin1_EF = return (BufferCodec latin1_encode (return ())) +latin1_EF :: IO (TextEncoder ()) +latin1_EF = + return (BufferCodec { + encode = latin1_encode, + close = return (), + getState = return (), + setState = const $ return () + }) latin1_checked :: TextEncoding latin1_checked = TextEncoding { mkTextDecoder = latin1_DF, mkTextEncoder = latin1_checked_EF } -latin1_checked_EF :: IO TextEncoder -latin1_checked_EF = return (BufferCodec latin1_checked_encode (return ())) +latin1_checked_EF :: IO (TextEncoder ()) +latin1_checked_EF = + return (BufferCodec { + encode = latin1_checked_encode, + close = return (), + getState = return (), + setState = const $ return () + }) latin1_decode :: DecodeBuffer diff --git a/GHC/IO/Encoding/Types.hs b/GHC/IO/Encoding/Types.hs index b857bdf..4d267aa 100644 --- a/GHC/IO/Encoding/Types.hs +++ b/GHC/IO/Encoding/Types.hs @@ -28,7 +28,7 @@ import GHC.IO.Buffer -- ----------------------------------------------------------------------------- -- Text encoders/decoders -data BufferCodec from to = BufferCodec { +data BufferCodec from to state = BufferCodec { encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to), -- ^ The @encode@ function translates elements of the buffer @from@ -- to the buffer @to@. It should translate as many elements as possible @@ -45,10 +45,27 @@ data BufferCodec from to = BufferCodec { -- library in order to report translation errors at the point they -- actually occur, rather than when the buffer is translated. -- - close :: IO () + close :: IO (), -- ^ Resources associated with the encoding may now be released. -- The @encode@ function may not be called again after calling -- @close@. + + getState :: IO state, + -- ^ Return the current state of the codec. + -- + -- Many codecs are not stateful, and in these case the state can be + -- represented as '()'. Other codecs maintain a state. For + -- example, UTF-16 recognises a BOM (byte-order-mark) character at + -- the beginning of the input, and remembers thereafter whether to + -- use big-endian or little-endian mode. In this case, the state + -- of the codec would include two pieces of information: whether we + -- are at the beginning of the stream (the BOM only occurs at the + -- beginning), and if not, whether to use the big or little-endian + -- encoding. + + setState :: state -> IO() + -- restore the state of the codec using the state from a previous + -- call to 'getState'. } type DecodeBuffer = Buffer Word8 -> Buffer Char @@ -57,16 +74,16 @@ type DecodeBuffer = Buffer Word8 -> Buffer Char type EncodeBuffer = Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8) -type TextDecoder = BufferCodec Word8 CharBufElem -type TextEncoder = BufferCodec CharBufElem Word8 +type TextDecoder state = BufferCodec Word8 CharBufElem state +type TextEncoder state = BufferCodec CharBufElem Word8 state -- | A 'TextEncoding' is a specification of a conversion scheme -- between sequences of bytes and sequences of Unicode characters. -- -- For example, UTF-8 is an encoding of Unicode characters into a sequence --- of bytes. The 'TextEncoding' for UTF-8 is 'utf_8'. +-- of bytes. The 'TextEncoding' for UTF-8 is 'utf8'. data TextEncoding - = TextEncoding { - mkTextDecoder :: IO TextDecoder, - mkTextEncoder :: IO TextEncoder + = forall dstate estate . TextEncoding { + mkTextDecoder :: IO (TextDecoder dstate), + mkTextEncoder :: IO (TextEncoder estate) } diff --git a/GHC/IO/Encoding/UTF16.hs b/GHC/IO/Encoding/UTF16.hs index e3801c0..a5a6b62 100644 --- a/GHC/IO/Encoding/UTF16.hs +++ b/GHC/IO/Encoding/UTF16.hs @@ -62,15 +62,25 @@ utf16 :: TextEncoding utf16 = TextEncoding { mkTextDecoder = utf16_DF, mkTextEncoder = utf16_EF } -utf16_DF :: IO TextDecoder +utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer)) utf16_DF = do seen_bom <- newIORef Nothing - return (BufferCodec (utf16_decode seen_bom) (return ())) - -utf16_EF :: IO TextEncoder + return (BufferCodec { + encode = utf16_decode seen_bom, + close = return (), + getState = readIORef seen_bom, + setState = writeIORef seen_bom + }) + +utf16_EF :: IO (TextEncoder Bool) utf16_EF = do done_bom <- newIORef False - return (BufferCodec (utf16_encode done_bom) (return ())) + return (BufferCodec { + encode = utf16_encode done_bom, + close = return (), + getState = readIORef done_bom, + setState = writeIORef done_bom + }) utf16_encode :: IORef Bool -> EncodeBuffer utf16_encode done_bom input @@ -131,23 +141,45 @@ utf16be :: TextEncoding utf16be = TextEncoding { mkTextDecoder = utf16be_DF, mkTextEncoder = utf16be_EF } -utf16be_DF :: IO TextDecoder -utf16be_DF = return (BufferCodec utf16be_decode (return ())) - -utf16be_EF :: IO TextEncoder -utf16be_EF = return (BufferCodec utf16be_encode (return ())) - +utf16be_DF :: IO (TextDecoder ()) +utf16be_DF = + return (BufferCodec { + encode = utf16be_decode, + close = return (), + getState = return (), + setState = const $ return () + }) + +utf16be_EF :: IO (TextEncoder ()) +utf16be_EF = + return (BufferCodec { + encode = utf16be_encode, + close = return (), + getState = return (), + setState = const $ return () + }) utf16le :: TextEncoding utf16le = TextEncoding { mkTextDecoder = utf16le_DF, mkTextEncoder = utf16le_EF } -utf16le_DF :: IO TextDecoder -utf16le_DF = return (BufferCodec utf16le_decode (return ())) - -utf16le_EF :: IO TextEncoder -utf16le_EF = return (BufferCodec utf16le_encode (return ())) - +utf16le_DF :: IO (TextDecoder ()) +utf16le_DF = + return (BufferCodec { + encode = utf16le_decode, + close = return (), + getState = return (), + setState = const $ return () + }) + +utf16le_EF :: IO (TextEncoder ()) +utf16le_EF = + return (BufferCodec { + encode = utf16le_encode, + close = return (), + getState = return (), + setState = const $ return () + }) utf16be_decode :: DecodeBuffer diff --git a/GHC/IO/Encoding/UTF32.hs b/GHC/IO/Encoding/UTF32.hs index b26aaae..17a817e 100644 --- a/GHC/IO/Encoding/UTF32.hs +++ b/GHC/IO/Encoding/UTF32.hs @@ -51,15 +51,25 @@ utf32 :: TextEncoding utf32 = TextEncoding { mkTextDecoder = utf32_DF, mkTextEncoder = utf32_EF } -utf32_DF :: IO TextDecoder +utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer)) utf32_DF = do seen_bom <- newIORef Nothing - return (BufferCodec (utf32_decode seen_bom) (return ())) - -utf32_EF :: IO TextEncoder + return (BufferCodec { + encode = utf32_decode seen_bom, + close = return (), + getState = readIORef seen_bom, + setState = writeIORef seen_bom + }) + +utf32_EF :: IO (TextEncoder Bool) utf32_EF = do done_bom <- newIORef False - return (BufferCodec (utf32_encode done_bom) (return ())) + return (BufferCodec { + encode = utf32_encode done_bom, + close = return (), + getState = readIORef done_bom, + setState = writeIORef done_bom + }) utf32_encode :: IORef Bool -> EncodeBuffer utf32_encode done_bom input @@ -123,23 +133,46 @@ utf32be :: TextEncoding utf32be = TextEncoding { mkTextDecoder = utf32be_DF, mkTextEncoder = utf32be_EF } -utf32be_DF :: IO TextDecoder -utf32be_DF = return (BufferCodec utf32be_decode (return ())) - -utf32be_EF :: IO TextEncoder -utf32be_EF = return (BufferCodec utf32be_encode (return ())) +utf32be_DF :: IO (TextDecoder ()) +utf32be_DF = + return (BufferCodec { + encode = utf32be_decode, + close = return (), + getState = return (), + setState = const $ return () + }) + +utf32be_EF :: IO (TextEncoder ()) +utf32be_EF = + return (BufferCodec { + encode = utf32be_encode, + close = return (), + getState = return (), + setState = const $ return () + }) utf32le :: TextEncoding utf32le = TextEncoding { mkTextDecoder = utf32le_DF, mkTextEncoder = utf32le_EF } -utf32le_DF :: IO TextDecoder -utf32le_DF = return (BufferCodec utf32le_decode (return ())) - -utf32le_EF :: IO TextEncoder -utf32le_EF = return (BufferCodec utf32le_encode (return ())) - +utf32le_DF :: IO (TextDecoder ()) +utf32le_DF = + return (BufferCodec { + encode = utf32le_decode, + close = return (), + getState = return (), + setState = const $ return () + }) + +utf32le_EF :: IO (TextEncoder ()) +utf32le_EF = + return (BufferCodec { + encode = utf32le_encode, + close = return (), + getState = return (), + setState = const $ return () + }) utf32be_decode :: DecodeBuffer diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs index 43adff1..0efb187 100644 --- a/GHC/IO/Encoding/UTF8.hs +++ b/GHC/IO/Encoding/UTF8.hs @@ -39,11 +39,23 @@ utf8 :: TextEncoding utf8 = TextEncoding { mkTextDecoder = utf8_DF, mkTextEncoder = utf8_EF } -utf8_DF :: IO TextDecoder -utf8_DF = return (BufferCodec utf8_decode (return ())) - -utf8_EF :: IO TextEncoder -utf8_EF = return (BufferCodec utf8_encode (return ())) +utf8_DF :: IO (TextDecoder ()) +utf8_DF = + return (BufferCodec { + encode = utf8_decode, + close = return (), + getState = return (), + setState = const $ return () + }) + +utf8_EF :: IO (TextEncoder ()) +utf8_EF = + return (BufferCodec { + encode = utf8_encode, + close = return (), + getState = return (), + setState = const $ return () + }) utf8_decode :: DecodeBuffer utf8_decode diff --git a/GHC/IO/Handle.hs b/GHC/IO/Handle.hs index 8345616..f436f15 100644 --- a/GHC/IO/Handle.hs +++ b/GHC/IO/Handle.hs @@ -259,8 +259,10 @@ hSetEncoding :: Handle -> TextEncoding -> IO () hSetEncoding hdl encoding = do withHandle "hSetEncoding" hdl $ \h_@Handle__{..} -> do flushCharBuffer h_ - (mb_encoder,mb_decoder) <- getEncoding (Just encoding) haType - return (Handle__{ haDecoder = mb_decoder, haEncoder = mb_encoder, .. }, + openTextEncoding (Just encoding) haType $ \ mb_encoder mb_decoder -> do + bbuf <- readIORef haByteBuffer + ref <- newIORef (error "last_decode") + return (Handle__{ haLastDecode = ref, haDecoder = mb_decoder, haEncoder = mb_encoder, .. }, ()) -- ----------------------------------------------------------------------------- @@ -513,15 +515,21 @@ hSetBinaryMode handle bin = withAllHandles__ "hSetBinaryMode" handle $ \ h_@Handle__{..} -> do flushBuffer h_ + let mb_te | bin = Nothing | otherwise = Just localeEncoding + openTextEncoding mb_te haType $ \ mb_encoder mb_decoder -> do + -- should match the default newline mode, whatever that is let nl | bin = noNewlineTranslation | otherwise = nativeNewlineMode - (mb_encoder, mb_decoder) <- getEncoding mb_te haType - return Handle__{ haEncoder = mb_encoder, + bbuf <- readIORef haByteBuffer + ref <- newIORef (error "codec_state", bbuf) + + return Handle__{ haLastDecode = ref, + haEncoder = mb_encoder, haDecoder = mb_decoder, haInputNL = inputNL nl, haOutputNL = outputNL nl, .. } diff --git a/GHC/IO/Handle/Internals.hs b/GHC/IO/Handle/Internals.hs index 739c422..ed32eaa 100644 --- a/GHC/IO/Handle/Internals.hs +++ b/GHC/IO/Handle/Internals.hs @@ -31,7 +31,7 @@ module GHC.IO.Handle.Internals ( wantSeekableHandle, mkHandle, mkFileHandle, mkDuplexHandle, - getEncoding, initBufferState, + openTextEncoding, initBufferState, dEFAULT_CHAR_BUFFER_SIZE, flushBuffer, flushWriteBuffer, flushWriteBuffer_, flushCharReadBuffer, @@ -432,7 +432,7 @@ flushCharReadBuffer Handle__{..} = do -- haLastDecode is the byte buffer just before we did our last batch of -- decoding. We're going to re-decode the bytes up to the current char, -- to find out where we should revert the byte buffer to. - bbuf0 <- readIORef haLastDecode + (codec_state, bbuf0) <- readIORef haLastDecode cbuf0 <- readIORef haCharBuffer writeIORef haCharBuffer cbuf0{ bufL=0, bufR=0 } @@ -453,24 +453,17 @@ flushCharReadBuffer Handle__{..} = do Just decoder -> do debugIO ("flushCharReadBuffer re-decode, bbuf=" ++ summaryBuffer bbuf0 ++ " cbuf=" ++ summaryBuffer cbuf0) + + -- restore the codec state + setState decoder codec_state (bbuf1,cbuf1) <- (encode decoder) bbuf0 cbuf0{ bufL=0, bufR=0, bufSize = bufL cbuf0 } - -- tricky case: if the decoded string starts with e BOM, then it was - -- probably ignored last time we decoded these bytes, and we should - -- therefore decode another char. - (c,_) <- readCharBuf (bufRaw cbuf1) (bufL cbuf1) - (bbuf2,_) <- if (c == '\xfeff') - then do debugIO "found BOM, decoding another char" - (encode decoder) bbuf1 - cbuf0{ bufL=0, bufR=0, bufSize = 1 } - else return (bbuf1,cbuf1) - debugIO ("finished, bbuf=" ++ summaryBuffer bbuf1 ++ " cbuf=" ++ summaryBuffer cbuf1) - writeIORef haByteBuffer bbuf2 + writeIORef haByteBuffer bbuf1 -- When flushing the byte read buffer, we seek backwards by the number @@ -508,12 +501,12 @@ mkHandle :: (IODevice dev, BufferedIO dev, Typeable dev) => dev -> IO Handle mkHandle dev filepath ha_type buffered mb_codec nl finalizer other_side = do + openTextEncoding mb_codec ha_type $ \ mb_encoder mb_decoder -> do + let buf_state = initBufferState ha_type bbuf <- Buffered.newBuffer dev buf_state bbufref <- newIORef bbuf - last_decode <- newIORef bbuf - - (mb_encoder, mb_decoder) <- getEncoding mb_codec ha_type + last_decode <- newIORef (error "codec_state", bbuf) (cbufref,bmode) <- if buffered then getCharBuffer dev buf_state @@ -585,23 +578,25 @@ initBufferState :: HandleType -> BufferState initBufferState ReadHandle = ReadBuffer initBufferState _ = WriteBuffer -getEncoding :: Maybe TextEncoding -> HandleType - -> IO (Maybe TextEncoder, - Maybe TextDecoder) +openTextEncoding + :: Maybe TextEncoding + -> HandleType + -> (forall es ds . Maybe (TextEncoder es) -> Maybe (TextDecoder ds) -> IO a) + -> IO a -getEncoding Nothing ha_type = return (Nothing, Nothing) -getEncoding (Just te) ha_type = do +openTextEncoding Nothing ha_type cont = cont Nothing Nothing +openTextEncoding (Just TextEncoding{..}) ha_type cont = do mb_decoder <- if isReadableHandleType ha_type then do - decoder <- mkTextDecoder te + decoder <- mkTextDecoder return (Just decoder) else return Nothing mb_encoder <- if isWritableHandleType ha_type then do - encoder <- mkTextEncoder te + encoder <- mkTextEncoder return (Just encoder) else return Nothing - return (mb_encoder, mb_decoder) + cont mb_encoder mb_decoder -- --------------------------------------------------------------------------- -- closing Handles @@ -737,10 +732,15 @@ readTextDevice h_@Handle__{..} cbuf = do debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf1) - writeIORef haLastDecode bbuf1 - (bbuf2,cbuf') <- case haDecoder of - Nothing -> latin1_decode bbuf1 cbuf - Just decoder -> (encode decoder) bbuf1 cbuf + (bbuf2,cbuf') <- + case haDecoder of + Nothing -> do + writeIORef haLastDecode (error "codec_state", bbuf1) + latin1_decode bbuf1 cbuf + Just decoder -> do + state <- getState decoder + writeIORef haLastDecode (state, bbuf1) + (encode decoder) bbuf1 cbuf debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++ " bbuf=" ++ summaryBuffer bbuf2) @@ -766,10 +766,15 @@ readTextDevice' h_@Handle__{..} bbuf0 cbuf = do debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf2) - writeIORef haLastDecode bbuf2 - (bbuf3,cbuf') <- case haDecoder of - Nothing -> latin1_decode bbuf2 cbuf - Just decoder -> (encode decoder) bbuf2 cbuf + (bbuf3,cbuf') <- + case haDecoder of + Nothing -> do + writeIORef haLastDecode (error "codec_state", bbuf2) + latin1_decode bbuf2 cbuf + Just decoder -> do + state <- getState decoder + writeIORef haLastDecode (state, bbuf2) + (encode decoder) bbuf2 cbuf debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++ " bbuf=" ++ summaryBuffer bbuf3) diff --git a/GHC/IO/Handle/Types.hs b/GHC/IO/Handle/Types.hs index f3cf717..cdde7d8 100644 --- a/GHC/IO/Handle/Types.hs +++ b/GHC/IO/Handle/Types.hs @@ -121,17 +121,17 @@ instance Eq Handle where _ == _ = False data Handle__ - = forall dev . (IODevice dev, BufferedIO dev, Typeable dev) => + = forall dev enc_state dec_state . (IODevice dev, BufferedIO dev, Typeable dev) => Handle__ { haDevice :: !dev, haType :: HandleType, -- type (read/write/append etc.) haByteBuffer :: !(IORef (Buffer Word8)), haBufferMode :: BufferMode, - haLastDecode :: !(IORef (Buffer Word8)), + haLastDecode :: !(IORef (dec_state, Buffer Word8)), haCharBuffer :: !(IORef (Buffer CharBufElem)), -- the current buffer haBuffers :: !(IORef (BufferList CharBufElem)), -- spare buffers - haEncoder :: Maybe TextEncoder, - haDecoder :: Maybe TextDecoder, + haEncoder :: Maybe (TextEncoder enc_state), + haDecoder :: Maybe (TextDecoder dec_state), haInputNL :: Newline, haOutputNL :: Newline, haOtherSide :: Maybe (MVar Handle__) -- ptr to the write side of a