From: Ben.Lippmeier@anu.edu.au
Date: Wed, 30 Sep 2009 08:42:29 +0000 (+0000)
Subject: Strip any Byte Order Mark (BOM) from the front of decoded streams.
X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=d9310d8b5afbf8ba865187c4a3cac1b3c3e2db6f;p=ghc-base.git

Strip any Byte Order Mark (BOM) from the front of decoded streams.

When decoding to UTF-32, Solaris iconv inserts a BOM at the front of the stream,
but Linux iconv doesn't.
---

diff --git a/GHC/IO/Handle/Internals.hs b/GHC/IO/Handle/Internals.hs
index cc9e3d3..b8dc82a 100644
--- a/GHC/IO/Handle/Internals.hs
+++ b/GHC/IO/Handle/Internals.hs
@@ -727,7 +727,7 @@ readTextDevice h_@Handle__{..} cbuf = do
 
   debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf1)
 
-  (bbuf2,cbuf') <-
+  (bbuf2,cbuf2) <-
       case haDecoder of
           Nothing      -> do
                writeIORef haLastDecode (error "codec_state", bbuf1)
@@ -737,13 +737,16 @@ readTextDevice h_@Handle__{..} cbuf = do
                writeIORef haLastDecode (state, bbuf1)
                (encode decoder) bbuf1 cbuf
 
-  debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
+  debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf2 ++
         " bbuf=" ++ summaryBuffer bbuf2)
 
+  cbuf3 <- stripByteOrderMark cbuf2
+
   writeIORef haByteBuffer bbuf2
-  if bufR cbuf' == bufR cbuf -- no new characters
+  if bufR cbuf3 == bufR cbuf -- no new characters
      then readTextDevice' h_ bbuf2 cbuf -- we need more bytes to make a Char
-     else return cbuf'
+     else return cbuf3
+
 
 -- we have an incomplete byte sequence at the end of the buffer: try to
 -- read more bytes.
@@ -792,7 +795,7 @@ readTextDeviceNonBlocking h_@Handle__{..} cbuf = do
                    if isNothing r then ioe_EOF else do -- raise EOF
                    return bbuf1
 
-  (bbuf2,cbuf') <-
+  (bbuf2,cbuf2) <-
       case haDecoder of
           Nothing      -> do
                writeIORef haLastDecode (error "codec_state", bbuf1)
@@ -802,5 +805,23 @@ readTextDeviceNonBlocking h_@Handle__{..} cbuf = do
                writeIORef haLastDecode (state, bbuf1)
                (encode decoder) bbuf1 cbuf
 
+  cbuf3 <- stripByteOrderMark cbuf2
+
   writeIORef haByteBuffer bbuf2
-  return cbuf'
+  return cbuf3
+
+
+-- | When converting from UTF-8 to UCS-4, Solaris iconv adds a Byte Order Mark (BOM)
+-- of value 0xfeff to the start of the stream. We don't want to return this to
+-- the caller, so strip it here. This is a safe operation for other platforms,
+-- so always do it.
+stripByteOrderMark :: CharBuffer -> IO CharBuffer
+stripByteOrderMark cbuf
+ | isEmptyBuffer cbuf
+ = return cbuf
+
+ | otherwise
+ = do firstChar <- peekCharBuf (bufRaw cbuf) 0
+      if firstChar == chr 0xfeff
+         then return (bufferRemove 1 cbuf)
+         else return cbuf