Use explicit language extensions & remove extension fields from base.cabal
[ghc-base.git] / GHC / IO / Encoding / UTF8.hs
index 0efb187..dea4fde 100644 (file)
@@ -1,5 +1,10 @@
-{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE NoImplicitPrelude
+           , BangPatterns
+           , NondecreasingIndentation
+           , MagicHash
+  #-}
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
 -----------------------------------------------------------------------------
 -- |
 -- Module      :  GHC.IO.Encoding.UTF8
 
 module GHC.IO.Encoding.UTF8 (
   utf8,
-  utf8_decode,
-  utf8_encode,
+  utf8_bom,
   ) where
 
 import GHC.Base
 import GHC.Real
 import GHC.Num
-import GHC.IO
+import GHC.IORef
+-- import GHC.IO
 import GHC.IO.Exception
 import GHC.IO.Buffer
 import GHC.IO.Encoding.Types
@@ -36,7 +41,8 @@ import Data.Bits
 import Data.Maybe
 
 utf8 :: TextEncoding
-utf8 = TextEncoding { mkTextDecoder = utf8_DF,
+utf8 = TextEncoding { textEncodingName = "UTF-8",
+                      mkTextDecoder = utf8_DF,
                      mkTextEncoder = utf8_EF }
 
 utf8_DF :: IO (TextDecoder ())
@@ -57,6 +63,74 @@ utf8_EF =
              setState = const $ return ()
           })
 
+utf8_bom :: TextEncoding
+utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM",
+                          mkTextDecoder = utf8_bom_DF,
+                          mkTextEncoder = utf8_bom_EF }
+
+utf8_bom_DF :: IO (TextDecoder Bool)
+utf8_bom_DF = do
+   ref <- newIORef True
+   return (BufferCodec {
+             encode   = utf8_bom_decode ref,
+             close    = return (),
+             getState = readIORef ref,
+             setState = writeIORef ref
+          })
+
+utf8_bom_EF :: IO (TextEncoder Bool)
+utf8_bom_EF = do
+   ref <- newIORef True
+   return (BufferCodec {
+             encode   = utf8_bom_encode ref,
+             close    = return (),
+             getState = readIORef ref,
+             setState = writeIORef ref
+          })
+
+utf8_bom_decode :: IORef Bool -> DecodeBuffer
+utf8_bom_decode ref
+  input@Buffer{  bufRaw=iraw, bufL=ir, bufR=iw,  bufSize=_  }
+  output
+ = do
+   first <- readIORef ref
+   if not first
+      then utf8_decode input output
+      else do
+       let no_bom = do writeIORef ref False; utf8_decode input output
+       if iw - ir < 1 then return (input,output) else do
+       c0 <- readWord8Buf iraw ir
+       if (c0 /= bom0) then no_bom else do
+       if iw - ir < 2 then return (input,output) else do
+       c1 <- readWord8Buf iraw (ir+1)
+       if (c1 /= bom1) then no_bom else do
+       if iw - ir < 3 then return (input,output) else do
+       c2 <- readWord8Buf iraw (ir+2)
+       if (c2 /= bom2) then no_bom else do
+       -- found a BOM, ignore it and carry on
+       writeIORef ref False
+       utf8_decode input{ bufL = ir + 3 } output
+
+utf8_bom_encode :: IORef Bool -> EncodeBuffer
+utf8_bom_encode ref input
+  output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
+ = do
+  b <- readIORef ref
+  if not b then utf8_encode input output
+           else if os - ow < 3
+                  then return (input,output)
+                  else do
+                    writeIORef ref False
+                    writeWord8Buf oraw ow     bom0
+                    writeWord8Buf oraw (ow+1) bom1
+                    writeWord8Buf oraw (ow+2) bom2
+                    utf8_encode input output{ bufR = ow+3 }
+
+bom0, bom1, bom2 :: Word8
+bom0 = 0xef
+bom1 = 0xbb
+bom2 = 0xbf
+
 utf8_decode :: DecodeBuffer
 utf8_decode 
   input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
@@ -68,29 +142,50 @@ utf8_decode
               c0 <- readWord8Buf iraw ir
               case c0 of
                 _ | c0 <= 0x7f -> do 
-                           writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
-                           loop (ir+1) (ow+1)
+                           ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
+                           loop (ir+1) ow'
                   | c0 >= 0xc0 && c0 <= 0xdf ->
                            if iw - ir < 2 then done ir ow else do
                            c1 <- readWord8Buf iraw (ir+1)
                            if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
-                           writeCharBuf oraw ow (chr2 c0 c1)
-                           loop (ir+2) (ow+1)
+                           ow' <- writeCharBuf oraw ow (chr2 c0 c1)
+                           loop (ir+2) ow'
                   | c0 >= 0xe0 && c0 <= 0xef ->
-                           if iw - ir < 3 then done ir ow else do
+                      case iw - ir of
+                        1 -> done ir ow
+                        2 -> do -- check for an error even when we don't have
+                                -- the full sequence yet (#3341)
+                           c1 <- readWord8Buf iraw (ir+1)
+                           if not (validate3 c0 c1 0x80) 
+                              then invalid else done ir ow
+                        _ -> do
                            c1 <- readWord8Buf iraw (ir+1)
                            c2 <- readWord8Buf iraw (ir+2)
                            if not (validate3 c0 c1 c2) then invalid else do
-                           writeCharBuf oraw ow (chr3 c0 c1 c2)
-                           loop (ir+3) (ow+1)
-                  | otherwise ->
-                           if iw - ir < 4 then done ir ow else do
+                           ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2)
+                           loop (ir+3) ow'
+                  | c0 >= 0xf0 ->
+                      case iw - ir of
+                        1 -> done ir ow
+                        2 -> do -- check for an error even when we don't have
+                                -- the full sequence yet (#3341)
+                           c1 <- readWord8Buf iraw (ir+1)
+                           if not (validate4 c0 c1 0x80 0x80)
+                              then invalid else done ir ow
+                        3 -> do
+                           c1 <- readWord8Buf iraw (ir+1)
+                           c2 <- readWord8Buf iraw (ir+2)
+                           if not (validate4 c0 c1 c2 0x80)
+                              then invalid else done ir ow
+                        _ -> do
                            c1 <- readWord8Buf iraw (ir+1)
                            c2 <- readWord8Buf iraw (ir+2)
                            c3 <- readWord8Buf iraw (ir+3)
                            if not (validate4 c0 c1 c2 c3) then invalid else do
-                           writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
-                           loop (ir+4) (ow+1)
+                           ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
+                           loop (ir+4) ow'
+                  | otherwise ->
+                           invalid
          where
            invalid = if ir > ir0 then done ir ow else ioe_decodingError