newIConv :: String -> String
-> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
- -> IO (BufferCodec a b)
+ -> IO (BufferCodec a b ())
newIConv from to fn =
withCString from $ \ from_str ->
withCString to $ \ to_str -> do
return ()
return BufferCodec{
encode = fn iconvt,
- close = iclose
+ close = iclose,
+ -- iconv doesn't supply a way to save/restore the state
+ getState = return (),
+ setState = const $ return ()
}
iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
latin1 = TextEncoding { mkTextDecoder = latin1_DF,
mkTextEncoder = latin1_EF }
-latin1_DF :: IO TextDecoder
-latin1_DF = return (BufferCodec latin1_decode (return ()))
+latin1_DF :: IO (TextDecoder ())
+latin1_DF =
+ return (BufferCodec {
+ encode = latin1_decode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
-latin1_EF :: IO TextEncoder
-latin1_EF = return (BufferCodec latin1_encode (return ()))
+latin1_EF :: IO (TextEncoder ())
+latin1_EF =
+ return (BufferCodec {
+ encode = latin1_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
latin1_checked :: TextEncoding
latin1_checked = TextEncoding { mkTextDecoder = latin1_DF,
mkTextEncoder = latin1_checked_EF }
-latin1_checked_EF :: IO TextEncoder
-latin1_checked_EF = return (BufferCodec latin1_checked_encode (return ()))
+latin1_checked_EF :: IO (TextEncoder ())
+latin1_checked_EF =
+ return (BufferCodec {
+ encode = latin1_checked_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
latin1_decode :: DecodeBuffer
-- -----------------------------------------------------------------------------
-- Text encoders/decoders
-data BufferCodec from to = BufferCodec {
+data BufferCodec from to state = BufferCodec {
encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
-- ^ The @encode@ function translates elements of the buffer @from@
-- to the buffer @to@. It should translate as many elements as possible
-- library in order to report translation errors at the point they
-- actually occur, rather than when the buffer is translated.
--
- close :: IO ()
+ close :: IO (),
-- ^ Resources associated with the encoding may now be released.
-- The @encode@ function may not be called again after calling
-- @close@.
+
+ getState :: IO state,
+ -- ^ Return the current state of the codec.
+ --
+ -- Many codecs are not stateful, and in these case the state can be
+ -- represented as '()'. Other codecs maintain a state. For
+ -- example, UTF-16 recognises a BOM (byte-order-mark) character at
+ -- the beginning of the input, and remembers thereafter whether to
+ -- use big-endian or little-endian mode. In this case, the state
+ -- of the codec would include two pieces of information: whether we
+ -- are at the beginning of the stream (the BOM only occurs at the
+ -- beginning), and if not, whether to use the big or little-endian
+ -- encoding.
+
+ setState :: state -> IO()
+ -- restore the state of the codec using the state from a previous
+ -- call to 'getState'.
}
type DecodeBuffer = Buffer Word8 -> Buffer Char
type EncodeBuffer = Buffer Char -> Buffer Word8
-> IO (Buffer Char, Buffer Word8)
-type TextDecoder = BufferCodec Word8 CharBufElem
-type TextEncoder = BufferCodec CharBufElem Word8
+type TextDecoder state = BufferCodec Word8 CharBufElem state
+type TextEncoder state = BufferCodec CharBufElem Word8 state
-- | A 'TextEncoding' is a specification of a conversion scheme
-- between sequences of bytes and sequences of Unicode characters.
--
-- For example, UTF-8 is an encoding of Unicode characters into a sequence
--- of bytes. The 'TextEncoding' for UTF-8 is 'utf_8'.
+-- of bytes. The 'TextEncoding' for UTF-8 is 'utf8'.
data TextEncoding
- = TextEncoding {
- mkTextDecoder :: IO TextDecoder,
- mkTextEncoder :: IO TextEncoder
+ = forall dstate estate . TextEncoding {
+ mkTextDecoder :: IO (TextDecoder dstate),
+ mkTextEncoder :: IO (TextEncoder estate)
}
utf16 = TextEncoding { mkTextDecoder = utf16_DF,
mkTextEncoder = utf16_EF }
-utf16_DF :: IO TextDecoder
+utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
utf16_DF = do
seen_bom <- newIORef Nothing
- return (BufferCodec (utf16_decode seen_bom) (return ()))
-
-utf16_EF :: IO TextEncoder
+ return (BufferCodec {
+ encode = utf16_decode seen_bom,
+ close = return (),
+ getState = readIORef seen_bom,
+ setState = writeIORef seen_bom
+ })
+
+utf16_EF :: IO (TextEncoder Bool)
utf16_EF = do
done_bom <- newIORef False
- return (BufferCodec (utf16_encode done_bom) (return ()))
+ return (BufferCodec {
+ encode = utf16_encode done_bom,
+ close = return (),
+ getState = readIORef done_bom,
+ setState = writeIORef done_bom
+ })
utf16_encode :: IORef Bool -> EncodeBuffer
utf16_encode done_bom input
utf16be = TextEncoding { mkTextDecoder = utf16be_DF,
mkTextEncoder = utf16be_EF }
-utf16be_DF :: IO TextDecoder
-utf16be_DF = return (BufferCodec utf16be_decode (return ()))
-
-utf16be_EF :: IO TextEncoder
-utf16be_EF = return (BufferCodec utf16be_encode (return ()))
-
+utf16be_DF :: IO (TextDecoder ())
+utf16be_DF =
+ return (BufferCodec {
+ encode = utf16be_decode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
+
+utf16be_EF :: IO (TextEncoder ())
+utf16be_EF =
+ return (BufferCodec {
+ encode = utf16be_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
utf16le :: TextEncoding
utf16le = TextEncoding { mkTextDecoder = utf16le_DF,
mkTextEncoder = utf16le_EF }
-utf16le_DF :: IO TextDecoder
-utf16le_DF = return (BufferCodec utf16le_decode (return ()))
-
-utf16le_EF :: IO TextEncoder
-utf16le_EF = return (BufferCodec utf16le_encode (return ()))
-
+utf16le_DF :: IO (TextDecoder ())
+utf16le_DF =
+ return (BufferCodec {
+ encode = utf16le_decode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
+
+utf16le_EF :: IO (TextEncoder ())
+utf16le_EF =
+ return (BufferCodec {
+ encode = utf16le_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
utf16be_decode :: DecodeBuffer
utf32 = TextEncoding { mkTextDecoder = utf32_DF,
mkTextEncoder = utf32_EF }
-utf32_DF :: IO TextDecoder
+utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
utf32_DF = do
seen_bom <- newIORef Nothing
- return (BufferCodec (utf32_decode seen_bom) (return ()))
-
-utf32_EF :: IO TextEncoder
+ return (BufferCodec {
+ encode = utf32_decode seen_bom,
+ close = return (),
+ getState = readIORef seen_bom,
+ setState = writeIORef seen_bom
+ })
+
+utf32_EF :: IO (TextEncoder Bool)
utf32_EF = do
done_bom <- newIORef False
- return (BufferCodec (utf32_encode done_bom) (return ()))
+ return (BufferCodec {
+ encode = utf32_encode done_bom,
+ close = return (),
+ getState = readIORef done_bom,
+ setState = writeIORef done_bom
+ })
utf32_encode :: IORef Bool -> EncodeBuffer
utf32_encode done_bom input
utf32be = TextEncoding { mkTextDecoder = utf32be_DF,
mkTextEncoder = utf32be_EF }
-utf32be_DF :: IO TextDecoder
-utf32be_DF = return (BufferCodec utf32be_decode (return ()))
-
-utf32be_EF :: IO TextEncoder
-utf32be_EF = return (BufferCodec utf32be_encode (return ()))
+utf32be_DF :: IO (TextDecoder ())
+utf32be_DF =
+ return (BufferCodec {
+ encode = utf32be_decode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
+
+utf32be_EF :: IO (TextEncoder ())
+utf32be_EF =
+ return (BufferCodec {
+ encode = utf32be_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
utf32le :: TextEncoding
utf32le = TextEncoding { mkTextDecoder = utf32le_DF,
mkTextEncoder = utf32le_EF }
-utf32le_DF :: IO TextDecoder
-utf32le_DF = return (BufferCodec utf32le_decode (return ()))
-
-utf32le_EF :: IO TextEncoder
-utf32le_EF = return (BufferCodec utf32le_encode (return ()))
-
+utf32le_DF :: IO (TextDecoder ())
+utf32le_DF =
+ return (BufferCodec {
+ encode = utf32le_decode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
+
+utf32le_EF :: IO (TextEncoder ())
+utf32le_EF =
+ return (BufferCodec {
+ encode = utf32le_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
utf32be_decode :: DecodeBuffer
utf8 = TextEncoding { mkTextDecoder = utf8_DF,
mkTextEncoder = utf8_EF }
-utf8_DF :: IO TextDecoder
-utf8_DF = return (BufferCodec utf8_decode (return ()))
-
-utf8_EF :: IO TextEncoder
-utf8_EF = return (BufferCodec utf8_encode (return ()))
+utf8_DF :: IO (TextDecoder ())
+utf8_DF =
+ return (BufferCodec {
+ encode = utf8_decode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
+
+utf8_EF :: IO (TextEncoder ())
+utf8_EF =
+ return (BufferCodec {
+ encode = utf8_encode,
+ close = return (),
+ getState = return (),
+ setState = const $ return ()
+ })
utf8_decode :: DecodeBuffer
utf8_decode
hSetEncoding hdl encoding = do
withHandle "hSetEncoding" hdl $ \h_@Handle__{..} -> do
flushCharBuffer h_
- (mb_encoder,mb_decoder) <- getEncoding (Just encoding) haType
- return (Handle__{ haDecoder = mb_decoder, haEncoder = mb_encoder, .. },
+ openTextEncoding (Just encoding) haType $ \ mb_encoder mb_decoder -> do
+ bbuf <- readIORef haByteBuffer
+ ref <- newIORef (error "last_decode")
+ return (Handle__{ haLastDecode = ref, haDecoder = mb_decoder, haEncoder = mb_encoder, .. },
())
-- -----------------------------------------------------------------------------
withAllHandles__ "hSetBinaryMode" handle $ \ h_@Handle__{..} ->
do
flushBuffer h_
+
let mb_te | bin = Nothing
| otherwise = Just localeEncoding
+ openTextEncoding mb_te haType $ \ mb_encoder mb_decoder -> do
+
-- should match the default newline mode, whatever that is
let nl | bin = noNewlineTranslation
| otherwise = nativeNewlineMode
- (mb_encoder, mb_decoder) <- getEncoding mb_te haType
- return Handle__{ haEncoder = mb_encoder,
+ bbuf <- readIORef haByteBuffer
+ ref <- newIORef (error "codec_state", bbuf)
+
+ return Handle__{ haLastDecode = ref,
+ haEncoder = mb_encoder,
haDecoder = mb_decoder,
haInputNL = inputNL nl,
haOutputNL = outputNL nl, .. }
wantSeekableHandle,
mkHandle, mkFileHandle, mkDuplexHandle,
- getEncoding, initBufferState,
+ openTextEncoding, initBufferState,
dEFAULT_CHAR_BUFFER_SIZE,
flushBuffer, flushWriteBuffer, flushWriteBuffer_, flushCharReadBuffer,
-- haLastDecode is the byte buffer just before we did our last batch of
-- decoding. We're going to re-decode the bytes up to the current char,
-- to find out where we should revert the byte buffer to.
- bbuf0 <- readIORef haLastDecode
+ (codec_state, bbuf0) <- readIORef haLastDecode
cbuf0 <- readIORef haCharBuffer
writeIORef haCharBuffer cbuf0{ bufL=0, bufR=0 }
Just decoder -> do
debugIO ("flushCharReadBuffer re-decode, bbuf=" ++ summaryBuffer bbuf0 ++
" cbuf=" ++ summaryBuffer cbuf0)
+
+ -- restore the codec state
+ setState decoder codec_state
(bbuf1,cbuf1) <- (encode decoder) bbuf0
cbuf0{ bufL=0, bufR=0, bufSize = bufL cbuf0 }
- -- tricky case: if the decoded string starts with e BOM, then it was
- -- probably ignored last time we decoded these bytes, and we should
- -- therefore decode another char.
- (c,_) <- readCharBuf (bufRaw cbuf1) (bufL cbuf1)
- (bbuf2,_) <- if (c == '\xfeff')
- then do debugIO "found BOM, decoding another char"
- (encode decoder) bbuf1
- cbuf0{ bufL=0, bufR=0, bufSize = 1 }
- else return (bbuf1,cbuf1)
-
debugIO ("finished, bbuf=" ++ summaryBuffer bbuf1 ++
" cbuf=" ++ summaryBuffer cbuf1)
- writeIORef haByteBuffer bbuf2
+ writeIORef haByteBuffer bbuf1
-- When flushing the byte read buffer, we seek backwards by the number
-> IO Handle
mkHandle dev filepath ha_type buffered mb_codec nl finalizer other_side = do
+ openTextEncoding mb_codec ha_type $ \ mb_encoder mb_decoder -> do
+
let buf_state = initBufferState ha_type
bbuf <- Buffered.newBuffer dev buf_state
bbufref <- newIORef bbuf
- last_decode <- newIORef bbuf
-
- (mb_encoder, mb_decoder) <- getEncoding mb_codec ha_type
+ last_decode <- newIORef (error "codec_state", bbuf)
(cbufref,bmode) <-
if buffered then getCharBuffer dev buf_state
initBufferState ReadHandle = ReadBuffer
initBufferState _ = WriteBuffer
-getEncoding :: Maybe TextEncoding -> HandleType
- -> IO (Maybe TextEncoder,
- Maybe TextDecoder)
+openTextEncoding
+ :: Maybe TextEncoding
+ -> HandleType
+ -> (forall es ds . Maybe (TextEncoder es) -> Maybe (TextDecoder ds) -> IO a)
+ -> IO a
-getEncoding Nothing ha_type = return (Nothing, Nothing)
-getEncoding (Just te) ha_type = do
+openTextEncoding Nothing ha_type cont = cont Nothing Nothing
+openTextEncoding (Just TextEncoding{..}) ha_type cont = do
mb_decoder <- if isReadableHandleType ha_type then do
- decoder <- mkTextDecoder te
+ decoder <- mkTextDecoder
return (Just decoder)
else
return Nothing
mb_encoder <- if isWritableHandleType ha_type then do
- encoder <- mkTextEncoder te
+ encoder <- mkTextEncoder
return (Just encoder)
else
return Nothing
- return (mb_encoder, mb_decoder)
+ cont mb_encoder mb_decoder
-- ---------------------------------------------------------------------------
-- closing Handles
debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf1)
- writeIORef haLastDecode bbuf1
- (bbuf2,cbuf') <- case haDecoder of
- Nothing -> latin1_decode bbuf1 cbuf
- Just decoder -> (encode decoder) bbuf1 cbuf
+ (bbuf2,cbuf') <-
+ case haDecoder of
+ Nothing -> do
+ writeIORef haLastDecode (error "codec_state", bbuf1)
+ latin1_decode bbuf1 cbuf
+ Just decoder -> do
+ state <- getState decoder
+ writeIORef haLastDecode (state, bbuf1)
+ (encode decoder) bbuf1 cbuf
debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf2)
debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf2)
- writeIORef haLastDecode bbuf2
- (bbuf3,cbuf') <- case haDecoder of
- Nothing -> latin1_decode bbuf2 cbuf
- Just decoder -> (encode decoder) bbuf2 cbuf
+ (bbuf3,cbuf') <-
+ case haDecoder of
+ Nothing -> do
+ writeIORef haLastDecode (error "codec_state", bbuf2)
+ latin1_decode bbuf2 cbuf
+ Just decoder -> do
+ state <- getState decoder
+ writeIORef haLastDecode (state, bbuf2)
+ (encode decoder) bbuf2 cbuf
debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf3)
_ == _ = False
data Handle__
- = forall dev . (IODevice dev, BufferedIO dev, Typeable dev) =>
+ = forall dev enc_state dec_state . (IODevice dev, BufferedIO dev, Typeable dev) =>
Handle__ {
haDevice :: !dev,
haType :: HandleType, -- type (read/write/append etc.)
haByteBuffer :: !(IORef (Buffer Word8)),
haBufferMode :: BufferMode,
- haLastDecode :: !(IORef (Buffer Word8)),
+ haLastDecode :: !(IORef (dec_state, Buffer Word8)),
haCharBuffer :: !(IORef (Buffer CharBufElem)), -- the current buffer
haBuffers :: !(IORef (BufferList CharBufElem)), -- spare buffers
- haEncoder :: Maybe TextEncoder,
- haDecoder :: Maybe TextDecoder,
+ haEncoder :: Maybe (TextEncoder enc_state),
+ haDecoder :: Maybe (TextDecoder dec_state),
haInputNL :: Newline,
haOutputNL :: Newline,
haOtherSide :: Maybe (MVar Handle__) -- ptr to the write side of a