Add the utf8_bom codec

author Simon Marlow <marlowsd@gmail.com>

Wed, 15 Jul 2009 12:22:57 +0000 (12:22 +0000)

committer Simon Marlow <marlowsd@gmail.com>

Wed, 15 Jul 2009 12:22:57 +0000 (12:22 +0000)
author Simon Marlow <marlowsd@gmail.com>
Wed, 15 Jul 2009 12:22:57 +0000 (12:22 +0000)
committer Simon Marlow <marlowsd@gmail.com>
Wed, 15 Jul 2009 12:22:57 +0000 (12:22 +0000)
diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs

index 78aad98..a1da1b1 100644 (file)
--- a/GHC/IO/Encoding.hs
+++ b/GHC/IO/Encoding.hs
@@ -16,7 +16,7 @@
  module GHC.IO.Encoding (
    BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
    latin1, latin1_encode, latin1_decode,
-  utf8, 
+  utf8, utf8_bom,
    utf16, utf16le, utf16be,
    utf32, utf32le, utf32be, 
    localeEncoding,
@@ -54,6 +54,17 @@ latin1 = Latin1.latin1_checked
  utf8  :: TextEncoding
  utf8 = UTF8.utf8
  
+-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
+-- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
+-- except that on input, the BOM sequence is ignored at the beginning
+-- of the stream, and on output, the BOM sequence is prepended.
+--
+-- The byte-order-mark is strictly unnecessary in UTF-8, but is
+-- sometimes used to identify the encoding of a file.
+--
+utf8_bom  :: TextEncoding
+utf8_bom = UTF8.utf8_bom
+
  -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  -- indicate endianness).
  utf16  :: TextEncoding
diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs

index 5912525..c249289 100644 (file)
--- a/GHC/IO/Encoding/UTF8.hs
+++ b/GHC/IO/Encoding/UTF8.hs
@@ -20,13 +20,13 @@
  
  module GHC.IO.Encoding.UTF8 (
    utf8,
-  utf8_decode,
-  utf8_encode,
+  utf8_bom,
    ) where
  
  import GHC.Base
  import GHC.Real
  import GHC.Num
+import GHC.IORef
  -- import GHC.IO
  import GHC.IO.Exception
  import GHC.IO.Buffer
@@ -57,6 +57,73 @@ utf8_EF =
               setState = const $ return ()
            })
  
+utf8_bom :: TextEncoding
+utf8_bom = TextEncoding { mkTextDecoder = utf8_bom_DF,
+                          mkTextEncoder = utf8_bom_EF }
+
+utf8_bom_DF :: IO (TextDecoder Bool)
+utf8_bom_DF = do
+   ref <- newIORef True
+   return (BufferCodec {
+             encode   = utf8_bom_decode ref,
+             close    = return (),
+             getState = readIORef ref,
+             setState = writeIORef ref
+          })
+
+utf8_bom_EF :: IO (TextEncoder Bool)
+utf8_bom_EF = do
+   ref <- newIORef True
+   return (BufferCodec {
+             encode   = utf8_bom_encode ref,
+             close    = return (),
+             getState = readIORef ref,
+             setState = writeIORef ref
+          })
+
+utf8_bom_decode :: IORef Bool -> DecodeBuffer
+utf8_bom_decode ref
+  input@Buffer{  bufRaw=iraw, bufL=ir, bufR=iw,  bufSize=_  }
+  output
+ = do
+   first <- readIORef ref
+   if not first
+      then utf8_decode input output
+      else do
+       let no_bom = do writeIORef ref False; utf8_decode input output
+       if iw - ir < 1 then return (input,output) else do
+       c0 <- readWord8Buf iraw ir
+       if (c0 /= bom0) then no_bom else do
+       if iw - ir < 2 then return (input,output) else do
+       c1 <- readWord8Buf iraw (ir+1)
+       if (c1 /= bom1) then no_bom else do
+       if iw - ir < 3 then return (input,output) else do
+       c2 <- readWord8Buf iraw (ir+2)
+       if (c2 /= bom2) then no_bom else do
+       -- found a BOM, ignore it and carry on
+       writeIORef ref False
+       utf8_decode input{ bufL = ir + 3 } output
+
+utf8_bom_encode :: IORef Bool -> EncodeBuffer
+utf8_bom_encode ref input
+  output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
+ = do
+  b <- readIORef ref
+  if not b then utf8_encode input output
+           else if os - ow < 3
+                  then return (input,output)
+                  else do
+                    writeIORef ref False
+                    writeWord8Buf oraw ow     bom0
+                    writeWord8Buf oraw (ow+1) bom1
+                    writeWord8Buf oraw (ow+2) bom2
+                    utf8_encode input output{ bufR = ow+3 }
+
+bom0, bom1, bom2 :: Word8
+bom0 = 0xef
+bom1 = 0xbb
+bom2 = 0xbf
+
  utf8_decode :: DecodeBuffer
  utf8_decode 
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
diff --git a/System/IO.hs b/System/IO.hs

index 9560c26..4467974 100644 (file)
--- a/System/IO.hs
+++ b/System/IO.hs
@@ -183,7 +183,7 @@ module System.IO (
      -- ** Unicode encodings
      TextEncoding, 
      latin1,
-    utf8, 
+    utf8, utf8_bom,
      utf16, utf16le, utf16be,
      utf32, utf32le, utf32be, 
      localeEncoding,
author	Simon Marlow <marlowsd@gmail.com>
	Wed, 15 Jul 2009 12:22:57 +0000 (12:22 +0000)
committer	Simon Marlow <marlowsd@gmail.com>
	Wed, 15 Jul 2009 12:22:57 +0000 (12:22 +0000)
GHC/IO/Encoding.hs		patch \| blob \| history
GHC/IO/Encoding/UTF8.hs		patch \| blob \| history
System/IO.hs		patch \| blob \| history