From 1bf6eeca9b174c5d9522a551e58daad6895faab7 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Wed, 15 Jul 2009 12:22:57 +0000 Subject: [PATCH] Add the utf8_bom codec as suggested during the discussion on the libraries list. --- GHC/IO/Encoding.hs | 13 ++++++++- GHC/IO/Encoding/UTF8.hs | 71 +++++++++++++++++++++++++++++++++++++++++++++-- System/IO.hs | 2 +- 3 files changed, 82 insertions(+), 4 deletions(-) diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs index 78aad98..a1da1b1 100644 --- a/GHC/IO/Encoding.hs +++ b/GHC/IO/Encoding.hs @@ -16,7 +16,7 @@ module GHC.IO.Encoding ( BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, latin1, latin1_encode, latin1_decode, - utf8, + utf8, utf8_bom, utf16, utf16le, utf16be, utf32, utf32le, utf32be, localeEncoding, @@ -54,6 +54,17 @@ latin1 = Latin1.latin1_checked utf8 :: TextEncoding utf8 = UTF8.utf8 +-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte +-- sequence 0xEF 0xBB 0xBF). This encoding behaves like 'utf8', +-- except that on input, the BOM sequence is ignored at the beginning +-- of the stream, and on output, the BOM sequence is prepended. +-- +-- The byte-order-mark is strictly unnecessary in UTF-8, but is +-- sometimes used to identify the encoding of a file. +-- +utf8_bom :: TextEncoding +utf8_bom = UTF8.utf8_bom + -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to -- indicate endianness). utf16 :: TextEncoding diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs index 5912525..c249289 100644 --- a/GHC/IO/Encoding/UTF8.hs +++ b/GHC/IO/Encoding/UTF8.hs @@ -20,13 +20,13 @@ module GHC.IO.Encoding.UTF8 ( utf8, - utf8_decode, - utf8_encode, + utf8_bom, ) where import GHC.Base import GHC.Real import GHC.Num +import GHC.IORef -- import GHC.IO import GHC.IO.Exception import GHC.IO.Buffer @@ -57,6 +57,73 @@ utf8_EF = setState = const $ return () }) +utf8_bom :: TextEncoding +utf8_bom = TextEncoding { mkTextDecoder = utf8_bom_DF, + mkTextEncoder = utf8_bom_EF } + +utf8_bom_DF :: IO (TextDecoder Bool) +utf8_bom_DF = do + ref <- newIORef True + return (BufferCodec { + encode = utf8_bom_decode ref, + close = return (), + getState = readIORef ref, + setState = writeIORef ref + }) + +utf8_bom_EF :: IO (TextEncoder Bool) +utf8_bom_EF = do + ref <- newIORef True + return (BufferCodec { + encode = utf8_bom_encode ref, + close = return (), + getState = readIORef ref, + setState = writeIORef ref + }) + +utf8_bom_decode :: IORef Bool -> DecodeBuffer +utf8_bom_decode ref + input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } + output + = do + first <- readIORef ref + if not first + then utf8_decode input output + else do + let no_bom = do writeIORef ref False; utf8_decode input output + if iw - ir < 1 then return (input,output) else do + c0 <- readWord8Buf iraw ir + if (c0 /= bom0) then no_bom else do + if iw - ir < 2 then return (input,output) else do + c1 <- readWord8Buf iraw (ir+1) + if (c1 /= bom1) then no_bom else do + if iw - ir < 3 then return (input,output) else do + c2 <- readWord8Buf iraw (ir+2) + if (c2 /= bom2) then no_bom else do + -- found a BOM, ignore it and carry on + writeIORef ref False + utf8_decode input{ bufL = ir + 3 } output + +utf8_bom_encode :: IORef Bool -> EncodeBuffer +utf8_bom_encode ref input + output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } + = do + b <- readIORef ref + if not b then utf8_encode input output + else if os - ow < 3 + then return (input,output) + else do + writeIORef ref False + writeWord8Buf oraw ow bom0 + writeWord8Buf oraw (ow+1) bom1 + writeWord8Buf oraw (ow+2) bom2 + utf8_encode input output{ bufR = ow+3 } + +bom0, bom1, bom2 :: Word8 +bom0 = 0xef +bom1 = 0xbb +bom2 = 0xbf + utf8_decode :: DecodeBuffer utf8_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } diff --git a/System/IO.hs b/System/IO.hs index 9560c26..4467974 100644 --- a/System/IO.hs +++ b/System/IO.hs @@ -183,7 +183,7 @@ module System.IO ( -- ** Unicode encodings TextEncoding, latin1, - utf8, + utf8, utf8_bom, utf16, utf16le, utf16be, utf32, utf32le, utf32be, localeEncoding, -- 1.7.10.4