From: Ben.Lippmeier@anu.edu.au Date: Wed, 30 Sep 2009 08:42:29 +0000 (+0000) Subject: Strip any Byte Order Mark (BOM) from the front of decoded streams. X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=eee5b61e5ff6f141a778a77997939b72ebb3feb3;p=ghc-base.git Strip any Byte Order Mark (BOM) from the front of decoded streams. When decoding to UTF-32, Solaris iconv inserts a BOM at the front of the stream, but Linux iconv doesn't. --- diff --git a/GHC/IO/Handle/Internals.hs b/GHC/IO/Handle/Internals.hs index b8dc82a..cc9e3d3 100644 --- a/GHC/IO/Handle/Internals.hs +++ b/GHC/IO/Handle/Internals.hs @@ -727,7 +727,7 @@ readTextDevice h_@Handle__{..} cbuf = do debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf1) - (bbuf2,cbuf2) <- + (bbuf2,cbuf') <- case haDecoder of Nothing -> do writeIORef haLastDecode (error "codec_state", bbuf1) @@ -737,16 +737,13 @@ readTextDevice h_@Handle__{..} cbuf = do writeIORef haLastDecode (state, bbuf1) (encode decoder) bbuf1 cbuf - debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf2 ++ + debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++ " bbuf=" ++ summaryBuffer bbuf2) - cbuf3 <- stripByteOrderMark cbuf2 - writeIORef haByteBuffer bbuf2 - if bufR cbuf3 == bufR cbuf -- no new characters + if bufR cbuf' == bufR cbuf -- no new characters then readTextDevice' h_ bbuf2 cbuf -- we need more bytes to make a Char - else return cbuf3 - + else return cbuf' -- we have an incomplete byte sequence at the end of the buffer: try to -- read more bytes. @@ -795,7 +792,7 @@ readTextDeviceNonBlocking h_@Handle__{..} cbuf = do if isNothing r then ioe_EOF else do -- raise EOF return bbuf1 - (bbuf2,cbuf2) <- + (bbuf2,cbuf') <- case haDecoder of Nothing -> do writeIORef haLastDecode (error "codec_state", bbuf1) @@ -805,23 +802,5 @@ readTextDeviceNonBlocking h_@Handle__{..} cbuf = do writeIORef haLastDecode (state, bbuf1) (encode decoder) bbuf1 cbuf - cbuf3 <- stripByteOrderMark cbuf2 - writeIORef haByteBuffer bbuf2 - return cbuf3 - - --- | When converting from UTF-8 to UCS-4, Solaris iconv adds a Byte Order Mark (BOM) --- of value 0xfeff to the start of the stream. We don't want to return this to --- the caller, so strip it here. This is a safe operation for other platforms, --- so always do it. -stripByteOrderMark :: CharBuffer -> IO CharBuffer -stripByteOrderMark cbuf - | isEmptyBuffer cbuf - = return cbuf - - | otherwise - = do firstChar <- peekCharBuf (bufRaw cbuf) 0 - if firstChar == chr 0xfeff - then return (bufferRemove 1 cbuf) - else return cbuf + return cbuf'