Merge remote branch 'origin/master' into ghc-generics

author Simon Peyton Jones <simonpj@microsoft.com>

Thu, 19 May 2011 07:38:35 +0000 (08:38 +0100)

committer Simon Peyton Jones <simonpj@microsoft.com>

Thu, 19 May 2011 07:38:35 +0000 (08:38 +0100)
author Simon Peyton Jones <simonpj@microsoft.com>
Thu, 19 May 2011 07:38:35 +0000 (08:38 +0100)
committer Simon Peyton Jones <simonpj@microsoft.com>
Thu, 19 May 2011 07:38:35 +0000 (08:38 +0100)
diff --git a/Control/Exception/Base.hs b/Control/Exception/Base.hs

index a5d72ce..a617917 100644 (file)
--- a/Control/Exception/Base.hs
+++ b/Control/Exception/Base.hs
@@ -119,7 +119,7 @@ module Control.Exception.Base (
  
  #ifdef __GLASGOW_HASKELL__
  import GHC.Base
-import GHC.IO hiding (finally,onException)
+import GHC.IO hiding (bracket,finally,onException)
  import GHC.IO.Exception
  import GHC.Exception
  import GHC.Show
diff --git a/Foreign/C/String.hs b/Foreign/C/String.hs

index becfa4e..fdefdc6 100644 (file)
--- a/Foreign/C/String.hs
+++ b/Foreign/C/String.hs
@@ -23,7 +23,6 @@
  -----------------------------------------------------------------------------
  
  module Foreign.C.String (   -- representation of strings in C
-
    -- * C strings
  
    CString,           -- = Ptr CChar
@@ -31,8 +30,14 @@ module Foreign.C.String (   -- representation of strings in C
  
    -- ** Using a locale-dependent encoding
  
+#ifndef __GLASGOW_HASKELL__
    -- | Currently these functions are identical to their @CAString@ counterparts;
    -- eventually they will use an encoding determined by the current locale.
+#else
+  -- | These functions are different from their @CAString@ counterparts
+  -- in that they will use an encoding determined by the current locale,
+  -- rather than always assuming ASCII.
+#endif
  
    -- conversion of C strings into Haskell strings
    --
@@ -102,10 +107,15 @@ import Foreign.Storable
  import Data.Word
  
  #ifdef __GLASGOW_HASKELL__
+import Control.Monad
+
  import GHC.List
  import GHC.Real
  import GHC.Num
  import GHC.Base
+
+import {-# SOURCE #-} GHC.IO.Encoding
+import qualified GHC.Foreign as GHC
  #else
  import Data.Char ( chr, ord )
  #define unsafeChr chr
@@ -133,12 +143,20 @@ type CStringLen = (Ptr CChar, Int)
  -- | Marshal a NUL terminated C string into a Haskell string.
  --
  peekCString    :: CString -> IO String
+#ifndef __GLASGOW_HASKELL__
  peekCString = peekCAString
+#else
+peekCString = GHC.peekCString foreignEncoding
+#endif
  
  -- | Marshal a C string with explicit length into a Haskell string.
  --
  peekCStringLen           :: CStringLen -> IO String
+#ifndef __GLASGOW_HASKELL__
  peekCStringLen = peekCAStringLen
+#else
+peekCStringLen = GHC.peekCStringLen foreignEncoding
+#endif
  
  -- | Marshal a Haskell string into a NUL terminated C string.
  --
@@ -149,7 +167,11 @@ peekCStringLen = peekCAStringLen
  --   'Foreign.Marshal.Alloc.finalizerFree'.
  --
  newCString :: String -> IO CString
+#ifndef __GLASGOW_HASKELL__
  newCString = newCAString
+#else
+newCString = GHC.newCString foreignEncoding
+#endif
  
  -- | Marshal a Haskell string into a C string (ie, character array) with
  -- explicit length information.
@@ -159,7 +181,11 @@ newCString = newCAString
  --   'Foreign.Marshal.Alloc.finalizerFree'.
  --
  newCStringLen     :: String -> IO CStringLen
+#ifndef __GLASGOW_HASKELL__
  newCStringLen = newCAStringLen
+#else
+newCStringLen = GHC.newCStringLen foreignEncoding
+#endif
  
  -- | Marshal a Haskell string into a NUL terminated C string using temporary
  -- storage.
@@ -171,7 +197,11 @@ newCStringLen = newCAStringLen
  --   storage must /not/ be used after this.
  --
  withCString :: String -> (CString -> IO a) -> IO a
+#ifndef __GLASGOW_HASKELL__
  withCString = withCAString
+#else
+withCString = GHC.withCString foreignEncoding
+#endif
  
  -- | Marshal a Haskell string into a C string (ie, character array)
  -- in temporary storage, with explicit length information.
@@ -181,14 +211,26 @@ withCString = withCAString
  --   storage must /not/ be used after this.
  --
  withCStringLen         :: String -> (CStringLen -> IO a) -> IO a
+#ifndef __GLASGOW_HASKELL__
  withCStringLen = withCAStringLen
+#else
+withCStringLen = GHC.withCStringLen foreignEncoding
+#endif
  
+
+#ifndef __GLASGOW_HASKELL__
  -- | Determines whether a character can be accurately encoded in a 'CString'.
  -- Unrepresentable characters are converted to @\'?\'@.
  --
  -- Currently only Latin-1 characters are representable.
  charIsRepresentable :: Char -> IO Bool
  charIsRepresentable c = return (ord c < 256)
+#else
+-- -- | Determines whether a character can be accurately encoded in a 'CString'.
+-- -- Unrepresentable characters are converted to '?' or their nearest visual equivalent.
+charIsRepresentable :: Char -> IO Bool
+charIsRepresentable = GHC.charIsRepresentable foreignEncoding
+#endif
  
  -- single byte characters
  -- ----------------------
diff --git a/GHC/Conc/Windows.hs b/GHC/Conc/Windows.hs

index 6ae525c..fecbb20 100644 (file)
--- a/GHC/Conc/Windows.hs
+++ b/GHC/Conc/Windows.hs
@@ -42,7 +42,6 @@ import Control.Monad
  import Data.Bits (shiftR)
  import Data.Maybe (Maybe(..))
  import Data.Typeable
-import Foreign.C.Error (throwErrno)
  import GHC.Base
  import GHC.Conc.Sync
  import GHC.Enum (Enum)
@@ -55,6 +54,7 @@ import GHC.Read (Read)
  import GHC.Real (div, fromIntegral)
  import GHC.Show (Show)
  import GHC.Word (Word32, Word64)
+import GHC.Windows
  
  -- ----------------------------------------------------------------------------
  -- Thread waiting
@@ -236,7 +236,7 @@ service_loop wakeup old_delays = do
  
    r <- c_WaitForSingleObject wakeup timeout
    case r of
-    0xffffffff -> do c_maperrno; throwErrno "service_loop"
+    0xffffffff -> do throwGetLastError "service_loop"
      0 -> do
          r2 <- c_readIOManagerEvent
          exit <-
@@ -312,15 +312,6 @@ getDelay now all@(d : rest)
              milli_seconds = (micro_seconds + 999) `div` 1000
          in return (all, fromIntegral milli_seconds)
  
--- ToDo: this just duplicates part of System.Win32.Types, which isn't
--- available yet.  We should move some Win32 functionality down here,
--- maybe as part of the grand reorganisation of the base package...
-type HANDLE       = Ptr ()
-type DWORD        = Word32
-
-iNFINITE :: DWORD
-iNFINITE = 0xFFFFFFFF -- urgh
-
  foreign import ccall unsafe "getIOManagerEvent" -- in the RTS (ThrIOManager.c)
    c_getIOManagerEvent :: IO HANDLE
  
@@ -330,8 +321,5 @@ foreign import ccall unsafe "readIOManagerEvent" -- in the RTS (ThrIOManager.c)
  foreign import ccall unsafe "sendIOManagerEvent" -- in the RTS (ThrIOManager.c)
    c_sendIOManagerEvent :: Word32 -> IO ()
  
-foreign import ccall unsafe "maperrno"             -- in Win32Utils.c
-   c_maperrno :: IO ()
-
  foreign import stdcall "WaitForSingleObject"
     c_WaitForSingleObject :: HANDLE -> DWORD -> IO DWORD
diff --git a/GHC/Environment.hs b/GHC/Environment.hs

index 60325b3..73f85ed 100644 (file)
--- a/GHC/Environment.hs
+++ b/GHC/Environment.hs
@@ -1,12 +1,42 @@
-{-# LANGUAGE ForeignFunctionInterface #-}
+{-# LANGUAGE CPP, ForeignFunctionInterface #-}
  
  module GHC.Environment (getFullArgs) where
  
  import Prelude
  import Foreign
  import Foreign.C
+
+#ifdef mingw32_HOST_OS
+import GHC.IO (finally)
+import GHC.Windows
+
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+getFullArgs :: IO [String]
+getFullArgs = do
+    p_arg_string <- c_GetCommandLine
+    alloca $ \p_argc -> do
+     p_argv <- c_CommandLineToArgv p_arg_string p_argc
+     if p_argv == nullPtr
+      then throwGetLastError "getFullArgs"
+      else flip finally (c_LocalFree p_argv) $ do
+       argc <- peek p_argc
+       p_argvs <- peekArray (fromIntegral argc) p_argv
+       mapM peekCWString p_argvs
+
+foreign import stdcall unsafe "windows.h GetCommandLineW"
+    c_GetCommandLine :: IO (Ptr CWString)
+
+foreign import stdcall unsafe "windows.h CommandLineToArgvW"
+    c_CommandLineToArgv :: Ptr CWString -> Ptr CInt -> IO (Ptr CWString)
+
+foreign import stdcall unsafe "Windows.h LocalFree"
+    c_LocalFree :: Ptr a -> IO (Ptr a)
+#else
  import Control.Monad
  
+import GHC.IO.Encoding
+import qualified GHC.Foreign as GHC
+
  getFullArgs :: IO [String]
  getFullArgs =
    alloca $ \ p_argc ->
@@ -14,8 +44,8 @@ getFullArgs =
     getFullProgArgv p_argc p_argv
     p    <- fromIntegral `liftM` peek p_argc
     argv <- peek p_argv
-   peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString
+   peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString fileSystemEncoding)
  
  foreign import ccall unsafe "getFullProgArgv"
      getFullProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
-
+#endif
\ No newline at end of file
diff --git a/GHC/Foreign.hs b/GHC/Foreign.hs

new file mode 100644 (file)

index 0000000..109fa83
--- /dev/null
+++ b/GHC/Foreign.hs
@@ -0,0 +1,256 @@
+{-# LANGUAGE CPP, NoImplicitPrelude #-}
+
+-----------------------------------------------------------------------------
+-- |
+-- Module      :  GHC.IO.Encoding
+-- Copyright   :  (c) The University of Glasgow, 2008-2011
+-- License     :  see libraries/base/LICENSE
+-- 
+-- Maintainer  :  libraries@haskell.org
+-- Stability   :  internal
+-- Portability :  non-portable
+--
+-- Foreign marshalling support for CStrings with configurable encodings
+--
+-----------------------------------------------------------------------------
+
+module GHC.Foreign (
+    -- * C strings with a configurable encoding
+    
+    -- conversion of C strings into Haskell strings
+    --
+    peekCString,       -- :: TextEncoding -> CString    -> IO String
+    peekCStringLen,    -- :: TextEncoding -> CStringLen -> IO String
+    
+    -- conversion of Haskell strings into C strings
+    --
+    newCString,        -- :: TextEncoding -> String -> IO CString
+    newCStringLen,     -- :: TextEncoding -> String -> IO CStringLen
+    
+    -- conversion of Haskell strings into C strings using temporary storage
+    --
+    withCString,       -- :: TextEncoding -> String -> (CString    -> IO a) -> IO a
+    withCStringLen,    -- :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+    
+    charIsRepresentable, -- :: TextEncoding -> Char -> IO Bool
+  ) where
+
+import Foreign.Marshal.Array
+import Foreign.C.Types
+import Foreign.Ptr
+import Foreign.Storable
+
+import Data.Word
+
+-- Imports for the locale-encoding version of marshallers
+import Control.Monad
+
+import Data.Tuple (fst)
+import Data.Maybe
+
+import {-# SOURCE #-} System.Posix.Internals (puts)
+import GHC.Show ( show )
+
+import Foreign.Marshal.Alloc
+import Foreign.ForeignPtr
+
+import GHC.Err (undefined)
+import GHC.List
+import GHC.Num
+import GHC.Base
+
+import GHC.IO
+import GHC.IO.Exception
+import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
+import GHC.IO.Encoding.Types
+
+
+c_DEBUG_DUMP :: Bool
+c_DEBUG_DUMP = False
+
+putDebugMsg :: String -> IO ()
+putDebugMsg | c_DEBUG_DUMP = puts
+            | otherwise    = const (return ())
+
+
+-- These definitions are identical to those in Foreign.C.String, but copied in here to avoid a cycle:
+type CString    = Ptr CChar
+type CStringLen = (Ptr CChar, Int)
+
+-- exported functions
+-- ------------------
+
+-- | Marshal a NUL terminated C string into a Haskell string.
+--
+peekCString    :: TextEncoding -> CString -> IO String
+peekCString enc cp = do
+    sz <- lengthArray0 nUL cp
+    peekEncodedCString enc (cp, sz * cCharSize)
+
+-- | Marshal a C string with explicit length into a Haskell string.
+--
+peekCStringLen           :: TextEncoding -> CStringLen -> IO String
+peekCStringLen = peekEncodedCString
+
+-- | Marshal a Haskell string into a NUL terminated C string.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * new storage is allocated for the C string and must be
+--   explicitly freed using 'Foreign.Marshal.Alloc.free' or
+--   'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCString :: TextEncoding -> String -> IO CString
+newCString enc = liftM fst . newEncodedCString enc True
+
+-- | Marshal a Haskell string into a C string (ie, character array) with
+-- explicit length information.
+--
+-- * new storage is allocated for the C string and must be
+--   explicitly freed using 'Foreign.Marshal.Alloc.free' or
+--   'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCStringLen     :: TextEncoding -> String -> IO CStringLen
+newCStringLen enc = newEncodedCString enc False
+
+-- | Marshal a Haskell string into a NUL terminated C string using temporary
+-- storage.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * the memory is freed when the subcomputation terminates (either
+--   normally or via an exception), so the pointer to the temporary
+--   storage must /not/ be used after this.
+--
+withCString :: TextEncoding -> String -> (CString -> IO a) -> IO a
+withCString enc s act = withEncodedCString enc True s $ \(cp, _sz) -> act cp
+
+-- | Marshal a Haskell string into a C string (ie, character array)
+-- in temporary storage, with explicit length information.
+--
+-- * the memory is freed when the subcomputation terminates (either
+--   normally or via an exception), so the pointer to the temporary
+--   storage must /not/ be used after this.
+--
+withCStringLen         :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+withCStringLen enc = withEncodedCString enc False
+
+
+-- | Determines whether a character can be accurately encoded in a 'CString'.
+--
+-- Pretty much anyone who uses this function is in a state of sin because
+-- whether or not a character is encodable will, in general, depend on the
+-- context in which it occurs.
+charIsRepresentable :: TextEncoding -> Char -> IO Bool
+charIsRepresentable enc c = withCString enc [c] (fmap (== [c]) . peekCString enc) `catchException` (\e -> let _ = e :: IOException in return False)
+
+-- auxiliary definitions
+-- ----------------------
+
+-- C's end of string character
+nUL :: CChar
+nUL  = 0
+
+-- Size of a CChar in bytes
+cCharSize :: Int
+cCharSize = sizeOf (undefined :: CChar)
+
+
+{-# INLINE peekEncodedCString #-}
+peekEncodedCString :: TextEncoding -- ^ Encoding of CString
+                   -> CStringLen
+                   -> IO String    -- ^ String in Haskell terms
+peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
+  = bracket mk_decoder close $ \decoder -> do
+      let chunk_size = sz_bytes `max` 1 -- Decode buffer chunk size in characters: one iteration only for ASCII
+      from0 <- fmap (\fp -> bufferAdd sz_bytes (emptyBuffer fp sz_bytes ReadBuffer)) $ newForeignPtr_ (castPtr p)
+      to <- newCharBuffer chunk_size WriteBuffer
+
+      let go iteration from = do
+            (why, from', to') <- encode decoder from to
+            if isEmptyBuffer from'
+             then
+              -- No input remaining: @why@ will be InputUnderflow, but we don't care
+              fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
+             else do
+              -- Input remaining: what went wrong?
+              putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
+              (from'', to'') <- case why of InvalidSequence -> recover decoder from' to' -- These conditions are equally bad because
+                                            InputUnderflow  -> recover decoder from' to' -- they indicate malformed/truncated input
+                                            OutputUnderflow -> return (from', to')       -- We will have more space next time round
+              putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
+              putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
+              to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
+              fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
+
+      go (0 :: Int) from0
+
+{-# INLINE withEncodedCString #-}
+withEncodedCString :: TextEncoding         -- ^ Encoding of CString to create
+                   -> Bool                 -- ^ Null-terminate?
+                   -> String               -- ^ String to encode
+                   -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
+                   -> IO a
+withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
+  = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+      from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+      let go iteration to_sz_bytes = do
+           putDebugMsg ("withEncodedCString: " ++ show iteration)
+           allocaBytes to_sz_bytes $ \to_p -> do
+            mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes act
+            case mb_res of
+              Nothing  -> go (iteration + 1) (to_sz_bytes * 2)
+              Just res -> return res
+
+      -- If the input string is ASCII, this value will ensure we only allocate once
+      go (0 :: Int) (cCharSize * (sz + 1))
+
+{-# INLINE newEncodedCString #-}
+newEncodedCString :: TextEncoding  -- ^ Encoding of CString to create
+                  -> Bool          -- ^ Null-terminate?
+                  -> String        -- ^ String to encode
+                  -> IO CStringLen
+newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
+  = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+      from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+      let go iteration to_p to_sz_bytes = do
+           putDebugMsg ("newEncodedCString: " ++ show iteration)
+           mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes return
+           case mb_res of
+             Nothing  -> do
+                 let to_sz_bytes' = to_sz_bytes * 2
+                 to_p' <- reallocBytes to_p to_sz_bytes'
+                 go (iteration + 1) to_p' to_sz_bytes'
+             Just res -> return res
+
+      -- If the input string is ASCII, this value will ensure we only allocate once
+      let to_sz_bytes = cCharSize * (sz + 1)
+      to_p <- mallocBytes to_sz_bytes
+      go (0 :: Int) to_p to_sz_bytes
+
+
+tryFillBufferAndCall :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int
+                     -> (CStringLen -> IO a) -> IO (Maybe a)
+tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do
+    to_fp <- newForeignPtr_ to_p
+    go (0 :: Int) (from0, emptyBuffer to_fp to_sz_bytes WriteBuffer)
+  where
+    go iteration (from, to) = do
+      (why, from', to') <- encode encoder from to
+      putDebugMsg ("tryFillBufferAndCall: " ++ show iteration ++ " " ++ show why ++ " " ++ summaryBuffer from ++ " " ++ summaryBuffer from')
+      if isEmptyBuffer from'
+       then if null_terminate && bufferAvailable to' == 0
+             then return Nothing -- We had enough for the string but not the terminator: ask the caller for more buffer
+             else do
+               -- Awesome, we had enough buffer
+               let bytes = bufferElems to'
+               withBuffer to' $ \to_ptr -> do
+                   when null_terminate $ pokeElemOff to_ptr (bufR to') 0
+                   fmap Just $ act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes*
+       else case why of -- We didn't consume all of the input
+              InputUnderflow  -> recover encoder from' to' >>= go (iteration + 1) -- These conditions are equally bad
+              InvalidSequence -> recover encoder from' to' >>= go (iteration + 1) -- since the input was truncated/invalid
+              OutputUnderflow -> return Nothing -- Oops, out of buffer during decoding: ask the caller for more
diff --git a/GHC/IO.hs b/GHC/IO.hs

index 7295a2c..ad98a5e 100644 (file)
--- a/GHC/IO.hs
+++ b/GHC/IO.hs
@@ -36,7 +36,7 @@ module GHC.IO (
      mask, mask_, uninterruptibleMask, uninterruptibleMask_, 
      MaskingState(..), getMaskingState,
      block, unblock, blocked, unsafeUnmask,
-    onException, finally, evaluate
+    onException, bracket, finally, evaluate
    ) where
  
  import GHC.Base
@@ -432,6 +432,18 @@ uninterruptibleMask io = do
      MaskedInterruptible   -> blockUninterruptible $ io block
      MaskedUninterruptible -> io id
  
+bracket
+        :: IO a         -- ^ computation to run first (\"acquire resource\")
+        -> (a -> IO b)  -- ^ computation to run last (\"release resource\")
+        -> (a -> IO c)  -- ^ computation to run in-between
+        -> IO c         -- returns the value from the in-between computation
+bracket before after thing =
+  mask $ \restore -> do
+    a <- before
+    r <- restore (thing a) `onException` after a
+    _ <- after a
+    return r
+
  finally :: IO a         -- ^ computation to run first
          -> IO b         -- ^ computation to run afterward (even if an exception
                          -- was raised)
diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs

index 5d8ecb4..505824e 100644 (file)
--- a/GHC/IO/Encoding.hs
+++ b/GHC/IO/Encoding.hs
@@ -16,18 +16,20 @@
  -----------------------------------------------------------------------------
  
  module GHC.IO.Encoding (
-  BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
+  BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
    latin1, latin1_encode, latin1_decode,
    utf8, utf8_bom,
    utf16, utf16le, utf16be,
    utf32, utf32le, utf32be, 
-  localeEncoding,
+  localeEncoding, fileSystemEncoding, foreignEncoding,
    mkTextEncoding,
    ) where
  
  import GHC.Base
  --import GHC.IO
+import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.Word
  #if !defined(mingw32_HOST_OS)
@@ -41,10 +43,8 @@ import qualified GHC.IO.Encoding.UTF8   as UTF8
  import qualified GHC.IO.Encoding.UTF16  as UTF16
  import qualified GHC.IO.Encoding.UTF32  as UTF32
  
-#if defined(mingw32_HOST_OS)
+import Data.List
  import Data.Maybe
-import GHC.IO.Exception
-#endif
  
  -- -----------------------------------------------------------------------------
  
@@ -97,11 +97,32 @@ utf32be  :: TextEncoding
  utf32be = UTF32.utf32be
  
  -- | The Unicode encoding of the current locale
-localeEncoding  :: TextEncoding
+localeEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but allowing arbitrary
+-- undecodable bytes to be round-tripped through it.
+--
+-- This 'TextEncoding' is used to decode and encode command line arguments
+-- and environment variables on non-Windows platforms.
+--
+-- On Windows, this encoding *should not* be used if possible because
+-- the use of code pages is deprecated: Strings should be retrieved
+-- via the "wide" W-family of UTF-16 APIs instead
+fileSystemEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but where undecodable
+-- bytes are replaced with their closest visual match. Used for
+-- the 'CString' marshalling functions in "Foreign.C.String"
+foreignEncoding :: TextEncoding
+
  #if !defined(mingw32_HOST_OS)
  localeEncoding = Iconv.localeEncoding
+fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
+foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
  #else
  localeEncoding = CodePage.localeEncoding
+fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
+foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
  #endif
  
  -- | Look up the named Unicode encoding.  May fail with 
@@ -131,27 +152,40 @@ localeEncoding = CodePage.localeEncoding
  -- @CP@; for example, @\"CP1250\"@.
  --
  mkTextEncoding :: String -> IO TextEncoding
-#if !defined(mingw32_HOST_OS)
-mkTextEncoding = Iconv.mkTextEncoding
+mkTextEncoding e = case mb_coding_failure_mode of
+  Nothing -> unknown_encoding
+  Just cfm -> case enc of
+    "UTF-8"    -> return $ UTF8.mkUTF8 cfm
+    "UTF-16"   -> return $ UTF16.mkUTF16 cfm
+    "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
+    "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
+    "UTF-32"   -> return $ UTF32.mkUTF32 cfm
+    "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
+    "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
+#if defined(mingw32_HOST_OS)
+    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
+    _ -> unknown_encoding
  #else
-mkTextEncoding "UTF-8"    = return utf8
-mkTextEncoding "UTF-16"   = return utf16
-mkTextEncoding "UTF-16LE" = return utf16le
-mkTextEncoding "UTF-16BE" = return utf16be
-mkTextEncoding "UTF-32"   = return utf32
-mkTextEncoding "UTF-32LE" = return utf32le
-mkTextEncoding "UTF-32BE" = return utf32be
-mkTextEncoding ('C':'P':n)
-    | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
-mkTextEncoding e = ioException
-     (IOError Nothing NoSuchThing "mkTextEncoding"
-          ("unknown encoding:" ++ e)  Nothing Nothing)
+    _ -> Iconv.mkIconvEncoding cfm enc
  #endif
+  where
+    -- The only problem with actually documenting //IGNORE and //TRANSLIT as
+    -- supported suffixes is that they are not necessarily supported with non-GNU iconv
+    (enc, suffix) = span (/= '/') e
+    mb_coding_failure_mode = case suffix of
+        ""            -> Just ErrorOnCodingFailure
+        "//IGNORE"    -> Just IgnoreCodingFailure
+        "//TRANSLIT"  -> Just TransliterateCodingFailure
+        "//ROUNDTRIP" -> Just RoundtripFailure
+        _             -> Nothing
+    
+    unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
+                                            ("unknown encoding:" ++ e)  Nothing Nothing)
  
  latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
-latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
+latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for binary
  --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
  
  latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
-latin1_decode = Latin1.latin1_decode
+latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
  --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
diff --git a/GHC/IO/Encoding.hs-boot b/GHC/IO/Encoding.hs-boot

new file mode 100644 (file)

index 0000000..9223cc3
--- /dev/null
+++ b/GHC/IO/Encoding.hs-boot
@@ -0,0 +1,6 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+module GHC.IO.Encoding where
+
+import GHC.IO.Encoding.Types
+
+localeEncoding, fileSystemEncoding, foreignEncoding :: TextEncoding
\ No newline at end of file
diff --git a/GHC/IO/Encoding/CodePage.hs b/GHC/IO/Encoding/CodePage.hs

index 9ecc1fc..a6b4e95 100644 (file)
--- a/GHC/IO/Encoding/CodePage.hs
+++ b/GHC/IO/Encoding/CodePage.hs
@@ -4,8 +4,8 @@ module GHC.IO.Encoding.CodePage(
  #if !defined(mingw32_HOST_OS)
   ) where
  #else
-                        codePageEncoding,
-                        localeEncoding
+                        codePageEncoding, mkCodePageEncoding,
+                        localeEncoding, mkLocaleEncoding
                              ) where
  
  import GHC.Base
@@ -14,19 +14,19 @@ import GHC.Num
  import GHC.Enum
  import GHC.Word
  import GHC.IO (unsafePerformIO)
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.IO.Buffer
-import GHC.IO.Exception
  import Data.Bits
  import Data.Maybe
  import Data.List (lookup)
  
  import GHC.IO.Encoding.CodePage.Table
  
-import GHC.IO.Encoding.Latin1 (latin1)
-import GHC.IO.Encoding.UTF8 (utf8)
-import GHC.IO.Encoding.UTF16 (utf16le, utf16be)
-import GHC.IO.Encoding.UTF32 (utf32le, utf32be)
+import GHC.IO.Encoding.Latin1 (mkLatin1)
+import GHC.IO.Encoding.UTF8 (mkUTF8)
+import GHC.IO.Encoding.UTF16 (mkUTF16le, mkUTF16be)
+import GHC.IO.Encoding.UTF32 (mkUTF32le, mkUTF32be)
  
  -- note CodePage = UInt which might not work on Win64.  But the Win32 package
  -- also has this issue.
@@ -44,43 +44,59 @@ foreign import stdcall unsafe "windows.h GetConsoleCP"
  foreign import stdcall unsafe "windows.h GetACP"
      getACP :: IO Word32
  
-{-# NOINLINE localeEncoding #-}
+{-# NOINLINE currentCodePage #-}
+currentCodePage :: Word32
+currentCodePage = unsafePerformIO getCurrentCodePage
+
  localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ fmap codePageEncoding getCurrentCodePage
-    
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = mkCodePageEncoding cfm currentCodePage
+
  
  codePageEncoding :: Word32 -> TextEncoding
-codePageEncoding 65001 = utf8
-codePageEncoding 1200 = utf16le
-codePageEncoding 1201 = utf16be
-codePageEncoding 12000 = utf32le
-codePageEncoding 12001 = utf32be
-codePageEncoding cp = maybe latin1 (buildEncoding cp) (lookup cp codePageMap)
-
-buildEncoding :: Word32 -> CodePageArrays -> TextEncoding
-buildEncoding cp SingleByteCP {decoderArray = dec, encoderArray = enc}
+codePageEncoding = mkCodePageEncoding ErrorOnCodingFailure
+
+mkCodePageEncoding :: CodingFailureMode -> Word32 -> TextEncoding
+mkCodePageEncoding cfm 65001 = mkUTF8 cfm
+mkCodePageEncoding cfm 1200 = mkUTF16le cfm
+mkCodePageEncoding cfm 1201 = mkUTF16be cfm
+mkCodePageEncoding cfm 12000 = mkUTF32le cfm
+mkCodePageEncoding cfm 12001 = mkUTF32be cfm
+mkCodePageEncoding cfm cp = maybe (mkLatin1 cfm) (buildEncoding cfm cp) (lookup cp codePageMap)
+
+buildEncoding :: CodingFailureMode -> Word32 -> CodePageArrays -> TextEncoding
+buildEncoding cfm cp SingleByteCP {decoderArray = dec, encoderArray = enc}
    = TextEncoding {
-    textEncodingName = "CP" ++ show cp,
-    mkTextDecoder = return $ simpleCodec
-        $ decodeFromSingleByte dec
-    , mkTextEncoder = return $ simpleCodec $ encodeToSingleByte enc
+      textEncodingName = "CP" ++ show cp
+    , mkTextDecoder = return $ simpleCodec (recoverDecode cfm) $ decodeFromSingleByte dec
+    , mkTextEncoder = return $ simpleCodec (recoverEncode cfm) $ encodeToSingleByte enc
      }
  
  simpleCodec :: (Buffer from -> Buffer to -> IO (Buffer from, Buffer to))
+            -> (Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to))
                  -> BufferCodec from to ()
-simpleCodec f = BufferCodec {encode = f, close = return (), getState = return (),
-                                    setState = return }
+simpleCodec r f = BufferCodec {
+    encode = f,
+    recover = r,
+    close = return (),
+    getState = return (),
+    setState = return
+  }
  
  decodeFromSingleByte :: ConvArray Char -> DecodeBuffer
  decodeFromSingleByte convArr
      input@Buffer  { bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
      output@Buffer { bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
    = let
-        done !ir !ow = return (if ir==iw then input{ bufL=0, bufR=0}
-                                            else input{ bufL=ir},
-                                    output {bufR=ow})
+        done why !ir !ow = return (why,
+                                   if ir==iw then input{ bufL=0, bufR=0}
+                                             else input{ bufL=ir},
+                                   output {bufR=ow})
          loop !ir !ow
-            | ow >= os  || ir >= iw     = done ir ow
+            | ow >= os  = done OutputUnderflow ir ow
+            | ir >= iw  = done InputUnderflow ir ow
              | otherwise = do
                  b <- readWord8Buf iraw ir
                  let c = lookupConv convArr b
@@ -88,7 +104,7 @@ decodeFromSingleByte convArr
                  ow' <- writeCharBuf oraw ow c
                  loop (ir+1) ow'
            where
-            invalid = if ir > ir0 then done ir ow else ioe_decodingError
+            invalid = done InvalidSequence ir ow
      in loop ir0 ow0
  
  encodeToSingleByte :: CompactArray Char Word8 -> EncodeBuffer
@@ -98,11 +114,13 @@ encodeToSingleByte CompactArray { encoderMax = maxChar,
      input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
      output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
    = let
-        done !ir !ow = return (if ir==iw then input { bufL=0, bufR=0 }
-                                            else input { bufL=ir },
-                                output {bufR=ow})
+        done why !ir !ow = return (why,
+                                   if ir==iw then input { bufL=0, bufR=0 }
+                                             else input { bufL=ir },
+                                   output {bufR=ow})
          loop !ir !ow
-            | ow >= os || ir >= iw  = done ir ow
+            | ow >= os  = done OutputUnderflow ir ow
+            | ir >= iw  = done InputUnderflow ir ow
              | otherwise = do
                  (c,ir') <- readCharBuf iraw ir
                  case lookupCompact maxChar indices values c of
@@ -112,20 +130,10 @@ encodeToSingleByte CompactArray { encoderMax = maxChar,
                          writeWord8Buf oraw ow b
                          loop ir' (ow+1)
              where
-                invalid = if ir > ir0 then done ir ow else ioe_encodingError
+                invalid = done InvalidSequence ir ow
      in
      loop ir0 ow0
  
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
-    (IOError Nothing InvalidArgument "codePageEncoding"
-        "invalid code page byte sequence" Nothing Nothing)
-
-ioe_encodingError :: IO a
-ioe_encodingError = ioException
-    (IOError Nothing InvalidArgument "codePageEncoding"
-        "character is not in the code page" Nothing Nothing)
-
  
  --------------------------------------------
  -- Array access functions
diff --git a/GHC/IO/Encoding/Failure.hs b/GHC/IO/Encoding/Failure.hs

new file mode 100644 (file)

index 0000000..a27650b
--- /dev/null
+++ b/GHC/IO/Encoding/Failure.hs
@@ -0,0 +1,173 @@
+{-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module      :  GHC.IO.Encoding.Failure
+-- Copyright   :  (c) The University of Glasgow, 2008-2011
+-- License     :  see libraries/base/LICENSE
+-- 
+-- Maintainer  :  libraries@haskell.org
+-- Stability   :  internal
+-- Portability :  non-portable
+--
+-- Types for specifying how text encoding/decoding fails
+--
+-----------------------------------------------------------------------------
+
+module GHC.IO.Encoding.Failure (
+    CodingFailureMode(..), codingFailureModeSuffix,
+    isSurrogate,
+    surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
+    recoverDecode, recoverEncode
+  ) where
+
+import GHC.IO
+import GHC.IO.Buffer
+import GHC.IO.Exception
+
+import GHC.Base
+import GHC.Word
+import GHC.Show
+import GHC.Num
+import GHC.Real ( fromIntegral )
+
+--import System.Posix.Internals
+
+import Data.Maybe
+
+-- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
+-- how they handle illegal sequences.
+data CodingFailureMode = ErrorOnCodingFailure         -- ^ Throw an error when an illegal sequence is encountered
+                       | IgnoreCodingFailure          -- ^ Attempt to ignore and recover if an illegal sequence is encountered
+                       | TransliterateCodingFailure   -- ^ Replace with the closest visual match upon an illegal sequence
+                       | RoundtripFailure             -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped.
+                       deriving (Show)                -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
+                                                      -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
+                                                      -- ASCII characters must be padded to two bytes to retain their meaning.
+
+-- Note [Roundtripping]
+-- ~~~~~~~~~~~~~~~~~~~~
+--
+-- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints
+-- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use
+-- the range of private-use characters from 0xEF80 to 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery.
+--
+-- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when
+-- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a
+-- chance to replace it with the byte we originally escaped.
+--
+-- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace
+-- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString)
+-- we have to do the inverse process.
+--
+-- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them.
+-- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
+
+codingFailureModeSuffix :: CodingFailureMode -> String
+codingFailureModeSuffix ErrorOnCodingFailure       = ""
+codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
+codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
+codingFailureModeSuffix RoundtripFailure           = "//ROUNDTRIP"
+
+-- | In transliterate mode, we use this character when decoding unknown bytes.
+--
+-- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
+unrepresentableChar :: Char
+unrepresentableChar = '\xFFFD'
+
+-- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
+-- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't
+-- give valid Unicode.
+--
+-- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's
+-- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding.
+isSurrogate :: Char -> Bool
+isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
+  where x = ord c
+
+-- | We use some private-use characters for roundtripping unknown bytes through a String
+isRoundtripEscapeChar :: Char -> Bool
+isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000
+  where x = ord c
+
+-- | We use some surrogate characters for roundtripping unknown bytes through a String
+isRoundtripEscapeSurrogateChar :: Char -> Bool
+isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
+  where x = ord c
+
+-- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem)
+surrogatifyRoundtripCharacter :: Char -> Char
+surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xEF00 + 0xDC00)
+                                | otherwise               = c
+
+-- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings)
+desurrogatifyRoundtripCharacter :: Char -> Char
+desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xEF00)
+                                  | otherwise                        = c
+
+-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
+escapeToRoundtripCharacterSurrogate :: Word8 -> Char
+escapeToRoundtripCharacterSurrogate b
+  | b < 128   = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
+  | otherwise = chr (0xDC00 + fromIntegral b)
+
+-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
+unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
+unescapeRoundtripCharacterSurrogate c
+    | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
+    | otherwise                 = Nothing
+  where x = ord c
+
+recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
+recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
+                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
+ --puts $ "recoverDecode " ++ show ir
+ case cfm of
+  ErrorOnCodingFailure       -> ioe_decodingError
+  IgnoreCodingFailure        -> return (input { bufL=ir+1 }, output)
+  TransliterateCodingFailure -> do
+      ow' <- writeCharBuf oraw ow unrepresentableChar
+      return (input { bufL=ir+1 }, output { bufR=ow' })
+  RoundtripFailure           -> do
+      b <- readWord8Buf iraw ir
+      ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
+      return (input { bufL=ir+1 }, output { bufR=ow' })
+
+recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
+recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
+                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
+  (c,ir') <- readCharBuf iraw ir
+  --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
+  case cfm of
+    IgnoreCodingFailure        -> return (input { bufL=ir' }, output)
+    TransliterateCodingFailure -> do
+        if c == '?'
+         then return (input { bufL=ir' }, output)
+         else do
+          -- XXX: evil hack! To implement transliteration, we just poke an
+          -- ASCII ? into the input buffer and tell the caller to try and decode
+          -- again. This is *probably* safe given current uses of TextEncoding.
+          --
+          -- The "if" test above ensures we skip if the encoding fails to deal with
+          -- the ?, though this should never happen in practice as all encodings are
+          -- in fact capable of reperesenting all ASCII characters.
+          _ir' <- writeCharBuf iraw ir '?'
+          return (input, output)
+        
+        -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
+        -- encode a simple ASCII value
+        --writeWord8Buf oraw ow unrepresentableByte
+        --return (input { bufL=ir' }, output { bufR=ow+1 })
+    RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
+        writeWord8Buf oraw ow x
+        return (input { bufL=ir' }, output { bufR=ow+1 })
+    _                          -> ioe_encodingError
+
+ioe_decodingError :: IO a
+ioe_decodingError = ioException
+    (IOError Nothing InvalidArgument "recoverDecode"
+        "invalid byte sequence" Nothing Nothing)
+
+ioe_encodingError :: IO a
+ioe_encodingError = ioException
+    (IOError Nothing InvalidArgument "recoverEncode"
+        "invalid character" Nothing Nothing)
diff --git a/GHC/IO/Encoding/Iconv.hs b/GHC/IO/Encoding/Iconv.hs

index 6d87595..d919071 100644 (file)
--- a/GHC/IO/Encoding/Iconv.hs
+++ b/GHC/IO/Encoding/Iconv.hs
@@ -21,12 +21,8 @@
  -- #hide
  module GHC.IO.Encoding.Iconv (
  #if !defined(mingw32_HOST_OS)
-   mkTextEncoding,
-   latin1,
-   utf8, 
-   utf16, utf16le, utf16be,
-   utf32, utf32le, utf32be,
-   localeEncoding
+   iconvEncoding, mkIconvEncoding,
+   localeEncoding, mkLocaleEncoding
  #endif
   ) where
  
@@ -40,6 +36,7 @@ import Foreign.C
  import Data.Maybe
  import GHC.Base
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.List (span)
  import GHC.Num
@@ -56,47 +53,9 @@ iconv_trace s
   | c_DEBUG_DUMP = puts s
   | otherwise    = return ()
  
-puts :: String -> IO ()
-puts s = do _ <- withCAStringLen (s ++ "\n") $ \(p, len) ->
-                      -- In reality should be withCString, but assume ASCII to avoid loop
-                     c_write 1 (castPtr p) (fromIntegral len)
-            return ()
-
  -- -----------------------------------------------------------------------------
  -- iconv encoders/decoders
  
-{-# NOINLINE latin1 #-}
-latin1 :: TextEncoding
-latin1 = unsafePerformIO (mkTextEncoding "Latin1")
-
-{-# NOINLINE utf8 #-}
-utf8 :: TextEncoding
-utf8 = unsafePerformIO (mkTextEncoding "UTF8")
-
-{-# NOINLINE utf16 #-}
-utf16 :: TextEncoding
-utf16 = unsafePerformIO (mkTextEncoding "UTF16")
-
-{-# NOINLINE utf16le #-}
-utf16le :: TextEncoding
-utf16le = unsafePerformIO (mkTextEncoding "UTF16LE")
-
-{-# NOINLINE utf16be #-}
-utf16be :: TextEncoding
-utf16be = unsafePerformIO (mkTextEncoding "UTF16BE")
-
-{-# NOINLINE utf32 #-}
-utf32 :: TextEncoding
-utf32 = unsafePerformIO (mkTextEncoding "UTF32")
-
-{-# NOINLINE utf32le #-}
-utf32le :: TextEncoding
-utf32le = unsafePerformIO (mkTextEncoding "UTF32LE")
-
-{-# NOINLINE utf32be #-}
-utf32be :: TextEncoding
-utf32be = unsafePerformIO (mkTextEncoding "UTF32BE")
-
  {-# NOINLINE localeEncodingName #-}
  localeEncodingName :: String
  localeEncodingName = unsafePerformIO $ do
@@ -105,9 +64,11 @@ localeEncodingName = unsafePerformIO $ do
     cstr <- c_localeEncoding
     peekCAString cstr -- Assume charset names are ASCII
  
-{-# NOINLINE localeEncoding #-}
  localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ mkTextEncoding localeEncodingName
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName
  
  -- We hope iconv_t is a storable type.  It should be, since it has at least the
  -- value -1, which is a possible return value from iconv_open.
@@ -139,21 +100,25 @@ char_shift :: Int
  char_shift | charSize == 2 = 1
             | otherwise     = 2
  
-mkTextEncoding :: String -> IO TextEncoding
-mkTextEncoding charset = do
+iconvEncoding :: String -> IO TextEncoding
+iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
+
+mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
+mkIconvEncoding cfm charset = do
    return (TextEncoding { 
                  textEncodingName = charset,
-               mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) iconvDecode,
-               mkTextEncoder = newIConv haskellChar charset iconvEncode})
+               mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
+               mkTextEncoder = newIConv haskellChar charset                 (recoverEncode cfm) iconvEncode})
    where
      -- An annoying feature of GNU iconv is that the //PREFIXES only take
      -- effect when they appear on the tocode parameter to iconv_open:
      (raw_charset, suffix) = span (/= '/') charset
  
  newIConv :: String -> String
-   -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+   -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+   -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
     -> IO (BufferCodec a b ())
-newIConv from to fn =
+newIConv from to rec fn =
    -- Assume charset names are ASCII
    withCAString from $ \ from_str ->
    withCAString to   $ \ to_str -> do
@@ -161,22 +126,21 @@ newIConv from to fn =
      let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
      return BufferCodec{
                  encode = fn iconvt,
+                recover = rec,
                  close  = iclose,
                  -- iconv doesn't supply a way to save/restore the state
                  getState = return (),
                  setState = const $ return ()
                  }
  
-iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
-            -> IO (Buffer Word8, Buffer CharBufElem)
+iconvDecode :: IConv -> DecodeBuffer
  iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
  
-iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8
-            -> IO (Buffer CharBufElem, Buffer Word8)
+iconvEncode :: IConv -> EncodeBuffer
  iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
  
  iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int 
-  -> IO (Buffer a, Buffer b)
+            -> IO (CodingProgress, Buffer a, Buffer b)
  iconvRecode iconv_t
    input@Buffer{  bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_  }  iscale
    output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow, bufSize=os }  oscale
@@ -205,29 +169,23 @@ iconvRecode iconv_t
        iconv_trace ("iconvRecode after,  output=" ++ show (summaryBuffer new_output))
        if (res /= -1)
         then do -- all input translated
-          return (new_input, new_output)
+          return (InputUnderflow, new_input, new_output)
         else do
        errno <- getErrno
        case errno of
-        e |  e == eINVAL || e == e2BIG
-          || e == eILSEQ && new_inleft' /= (iw-ir) -> do
-            iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
-                -- Output overflow is harmless
-                --
-                -- Similarly, we ignore EILSEQ unless we converted no
-                -- characters.  Sometimes iconv reports EILSEQ for a
-                -- character in the input even when there is no room
-                -- in the output; in this case we might be about to
-                -- change the encoding anyway, so the following bytes
-                -- could very well be in a different encoding.
-                -- This also helps with pinpointing EILSEQ errors: we
-                -- don't report it until the rest of the characters in
-                -- the buffer have been drained.
-            return (new_input, new_output)
-
-        e -> do
-                iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
-                throwErrno "iconvRecoder"
-                       -- illegal sequence, or some other error
+        e | e == e2BIG  -> return (OutputUnderflow, new_input, new_output)
+          | e == eINVAL -> return (InputUnderflow, new_input, new_output)
+           -- Sometimes iconv reports EILSEQ for a
+           -- character in the input even when there is no room
+           -- in the output; in this case we might be about to
+           -- change the encoding anyway, so the following bytes
+           -- could very well be in a different encoding.
+           --
+           -- Because we can only say InvalidSequence if there is at least
+           -- one element left in the output, we have to special case this.
+          | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
+          | otherwise -> do
+              iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
+              throwErrno "iconvRecoder"
  
  #endif /* !mingw32_HOST_OS */
diff --git a/GHC/IO/Encoding/Latin1.hs b/GHC/IO/Encoding/Latin1.hs

index 6bf18c8..197222e 100644 (file)
--- a/GHC/IO/Encoding/Latin1.hs
+++ b/GHC/IO/Encoding/Latin1.hs
@@ -23,8 +23,8 @@
  -----------------------------------------------------------------------------
  
  module GHC.IO.Encoding.Latin1 (
-  latin1,
-  latin1_checked,
+  latin1, mkLatin1,
+  latin1_checked, mkLatin1_checked,
    latin1_decode,
    latin1_encode,
    latin1_checked_encode,
@@ -34,46 +34,54 @@ import GHC.Base
  import GHC.Real
  import GHC.Num
  -- import GHC.IO
-import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
-import Data.Maybe
  
  -- -----------------------------------------------------------------------------
  -- Latin1
  
  latin1 :: TextEncoding
-latin1 = TextEncoding { textEncodingName = "ISO8859-1",
-                        mkTextDecoder = latin1_DF,
-                        mkTextEncoder = latin1_EF }
+latin1 = mkLatin1 ErrorOnCodingFailure
  
-latin1_DF :: IO (TextDecoder ())
-latin1_DF =
+mkLatin1 :: CodingFailureMode -> TextEncoding
+mkLatin1 cfm = TextEncoding { textEncodingName = "ISO8859-1",
+                              mkTextDecoder = latin1_DF cfm,
+                              mkTextEncoder = latin1_EF cfm }
+
+latin1_DF :: CodingFailureMode -> IO (TextDecoder ())
+latin1_DF cfm =
    return (BufferCodec {
               encode   = latin1_decode,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
-latin1_EF :: IO (TextEncoder ())
-latin1_EF =
+latin1_EF :: CodingFailureMode -> IO (TextEncoder ())
+latin1_EF cfm =
    return (BufferCodec {
               encode   = latin1_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
  latin1_checked :: TextEncoding
-latin1_checked = TextEncoding { textEncodingName = "ISO8859-1(checked)",
-                                mkTextDecoder = latin1_DF,
-                                mkTextEncoder = latin1_checked_EF }
+latin1_checked = mkLatin1_checked ErrorOnCodingFailure
+
+mkLatin1_checked :: CodingFailureMode -> TextEncoding
+mkLatin1_checked cfm = TextEncoding { textEncodingName = "ISO8859-1(checked)",
+                                      mkTextDecoder = latin1_DF cfm,
+                                      mkTextEncoder = latin1_checked_EF cfm }
  
-latin1_checked_EF :: IO (TextEncoder ())
-latin1_checked_EF =
+latin1_checked_EF :: CodingFailureMode -> IO (TextEncoder ())
+latin1_checked_EF cfm =
    return (BufferCodec {
               encode   = latin1_checked_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
@@ -86,16 +94,18 @@ latin1_decode
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
         loop !ir !ow
-         | ow >= os || ir >= iw =  done ir ow
+         | ow >= os = done OutputUnderflow ir ow
+         | ir >= iw = done InputUnderflow ir ow
           | otherwise = do
                c0 <- readWord8Buf iraw ir
                ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
                loop (ir+1) ow'
  
         -- lambda-lifted, to avoid thunks being built in the inner-loop:
-       done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                          else input{ bufL=ir },
-                         output{ bufR=ow })
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
      in
      loop ir0 ow0
  
@@ -104,11 +114,13 @@ latin1_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ow >= os || ir >= iw =  done ir ow
+        | ow >= os = done OutputUnderflow ir ow
+        | ir >= iw = done InputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
             writeWord8Buf oraw ow (fromIntegral (ord c))
@@ -121,22 +133,19 @@ latin1_checked_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ow >= os || ir >= iw =  done ir ow
+        | ow >= os = done OutputUnderflow ir ow
+        | ir >= iw = done InputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
             if ord c > 0xff then invalid else do
             writeWord8Buf oraw ow (fromIntegral (ord c))
             loop ir' (ow+1)
          where
-           invalid = if ir > ir0 then done ir ow else ioe_encodingError
+           invalid = done InvalidSequence ir ow
      in
      loop ir0 ow0
-
-ioe_encodingError :: IO a
-ioe_encodingError = ioException
-     (IOError Nothing InvalidArgument "latin1_checked_encode"
-          "character is out of range for this encoding" Nothing Nothing)
diff --git a/GHC/IO/Encoding/Types.hs b/GHC/IO/Encoding/Types.hs

index ac9147a..ebce578 100644 (file)
--- a/GHC/IO/Encoding/Types.hs
+++ b/GHC/IO/Encoding/Types.hs
@@ -20,6 +20,7 @@ module GHC.IO.Encoding.Types (
      TextEncoding(..),
      TextEncoder, TextDecoder,
      EncodeBuffer, DecodeBuffer,
+    CodingProgress(..)
    ) where
  
  import GHC.Base
@@ -32,22 +33,41 @@ import GHC.IO.Buffer
  -- Text encoders/decoders
  
  data BufferCodec from to state = BufferCodec {
-  encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+  encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to),
     -- ^ The @encode@ function translates elements of the buffer @from@
     -- to the buffer @to@.  It should translate as many elements as possible
     -- given the sizes of the buffers, including translating zero elements
     -- if there is either not enough room in @to@, or @from@ does not
     -- contain a complete multibyte sequence.
-   -- 
-   -- @encode@ should raise an exception if, and only if, @from@
-   -- begins with an illegal sequence, or the first element of @from@
-   -- is not representable in the encoding of @to@.  That is, if any
-   -- elements can be successfully translated before an error is
-   -- encountered, then @encode@ should translate as much as it can
-   -- and not throw an exception.  This behaviour is used by the IO
+   --
+   -- The fact that as many elements as possible are translated is used by the IO
     -- library in order to report translation errors at the point they
     -- actually occur, rather than when the buffer is translated.
     --
+   -- To allow us to use iconv as a BufferCode efficiently, character buffers are
+   -- defined to contain lone surrogates instead of those private use characters that
+   -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
+   -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
+   -- respectively.
+   --
+   -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
+  
+  recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+   -- ^ The @recover@ function is used to continue decoding
+   -- in the presence of invalid or unrepresentable sequences. This includes
+   -- both those detected by @encode@ returning @InvalidSequence@ and those
+   -- that occur because the input byte sequence appears to be truncated.
+   --
+   -- Progress will usually be made by skipping the first element of the @from@
+   -- buffer. This function should only be called if you are certain that you
+   -- wish to do this skipping, and if the @to@ buffer has at least one element
+   -- of free space.
+   --
+   -- @recover@ may raise an exception rather than skipping anything.
+   --
+   -- Currently, some implementations of @recover@ may mutate the input buffer.
+   -- In particular, this feature is used to implement transliteration.
+  
    close  :: IO (),
     -- ^ Resources associated with the encoding may now be released.
     -- The @encode@ function may not be called again after calling
@@ -66,16 +86,16 @@ data BufferCodec from to state = BufferCodec {
     -- beginning), and if not, whether to use the big or little-endian
     -- encoding.
  
-  setState :: state -> IO()
+  setState :: state -> IO ()
     -- restore the state of the codec using the state from a previous
     -- call to 'getState'.
   }
  
  type DecodeBuffer = Buffer Word8 -> Buffer Char
-                  -> IO (Buffer Word8, Buffer Char)
+                  -> IO (CodingProgress, Buffer Word8, Buffer Char)
  
  type EncodeBuffer = Buffer Char -> Buffer Word8
-                  -> IO (Buffer Char, Buffer Word8)
+                  -> IO (CodingProgress, Buffer Char, Buffer Word8)
  
  type TextDecoder state = BufferCodec Word8 CharBufElem state
  type TextEncoder state = BufferCodec CharBufElem Word8 state
@@ -101,3 +121,11 @@ data TextEncoding
  instance Show TextEncoding where
    -- | Returns the value of 'textEncodingName'
    show te = textEncodingName te
+
+data CodingProgress = InputUnderflow  -- ^ Stopped because the input contains insufficient available elements,
+                                      -- or all of the input sequence has been sucessfully translated.
+                    | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements
+                    | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
+                                      -- to output at least one encoded ASCII character, but the input contains
+                                      -- an invalid or unrepresentable sequence
+                    deriving (Eq, Show)
diff --git a/GHC/IO/Encoding/UTF16.hs b/GHC/IO/Encoding/UTF16.hs

index 5cc55f5..1932220 100644 (file)
--- a/GHC/IO/Encoding/UTF16.hs
+++ b/GHC/IO/Encoding/UTF16.hs
@@ -25,15 +25,15 @@
  -----------------------------------------------------------------------------
  
  module GHC.IO.Encoding.UTF16 (
-  utf16,
+  utf16, mkUTF16,
    utf16_decode,
    utf16_encode,
  
-  utf16be,
+  utf16be, mkUTF16be,
    utf16be_decode,
    utf16be_encode,
  
-  utf16le,
+  utf16le, mkUTF16le,
    utf16le_decode,
    utf16le_encode,
    ) where
@@ -42,50 +42,42 @@ import GHC.Base
  import GHC.Real
  import GHC.Num
  -- import GHC.IO
-import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.Word
  import Data.Bits
  import Data.Maybe
  import GHC.IORef
  
-#if DEBUG
-import System.Posix.Internals
-import Foreign.C
-import GHC.Show
-import GHC.Ptr
-
-puts :: String -> IO ()
- -- In reality should be withCString, but assume ASCII to avoid possible loop
-puts s = do withCAStringLen (s++"\n") $ \(p,len) ->
-                c_write 1 (castPtr p) (fromIntegral len)
-            return ()
-#endif
-
  -- -----------------------------------------------------------------------------
  -- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
  
  utf16  :: TextEncoding
-utf16 = TextEncoding { textEncodingName = "UTF-16",
-                       mkTextDecoder = utf16_DF,
-                      mkTextEncoder = utf16_EF }
+utf16 = mkUTF16 ErrorOnCodingFailure
+
+mkUTF16 :: CodingFailureMode -> TextEncoding
+mkUTF16 cfm =  TextEncoding { textEncodingName = "UTF-16",
+                              mkTextDecoder = utf16_DF cfm,
+                              mkTextEncoder = utf16_EF cfm }
  
-utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
-utf16_DF = do
+utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
+utf16_DF cfm = do
    seen_bom <- newIORef Nothing
    return (BufferCodec {
               encode   = utf16_decode seen_bom,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = readIORef seen_bom,
               setState = writeIORef seen_bom
            })
  
-utf16_EF :: IO (TextEncoder Bool)
-utf16_EF = do
+utf16_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf16_EF cfm = do
    done_bom <- newIORef False
    return (BufferCodec {
               encode   = utf16_encode done_bom,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = readIORef done_bom,
               setState = writeIORef done_bom
@@ -98,7 +90,7 @@ utf16_encode done_bom input
    b <- readIORef done_bom
    if b then utf16_native_encode input output
         else if os - ow < 2
-               then return (input,output)
+               then return (OutputUnderflow,input,output)
                 else do
                      writeIORef done_bom True
                      writeWord8Buf oraw ow     bom1
@@ -114,7 +106,7 @@ utf16_decode seen_bom
     case mb of
       Just decode -> decode input output
       Nothing ->
-       if iw - ir < 2 then return (input,output) else do
+       if iw - ir < 2 then return (InputUnderflow,input,output) else do
         c0 <- readWord8Buf iraw ir
         c1 <- readWord8Buf iraw (ir+1)
         case () of
@@ -147,46 +139,56 @@ bom2 = bomL
  -- UTF16LE and UTF16BE
  
  utf16be :: TextEncoding
-utf16be = TextEncoding { textEncodingName = "UTF-16BE",
-                         mkTextDecoder = utf16be_DF,
-                        mkTextEncoder = utf16be_EF }
+utf16be = mkUTF16be ErrorOnCodingFailure
  
-utf16be_DF :: IO (TextDecoder ())
-utf16be_DF =
+mkUTF16be :: CodingFailureMode -> TextEncoding
+mkUTF16be cfm = TextEncoding { textEncodingName = "UTF-16BE",
+                               mkTextDecoder = utf16be_DF cfm,
+                               mkTextEncoder = utf16be_EF cfm }
+
+utf16be_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf16be_DF cfm =
    return (BufferCodec {
               encode   = utf16be_decode,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
-utf16be_EF :: IO (TextEncoder ())
-utf16be_EF =
+utf16be_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf16be_EF cfm =
    return (BufferCodec {
               encode   = utf16be_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
  utf16le :: TextEncoding
-utf16le = TextEncoding { textEncodingName = "UTF16-LE",
-                         mkTextDecoder = utf16le_DF,
-                        mkTextEncoder = utf16le_EF }
+utf16le = mkUTF16le ErrorOnCodingFailure
+
+mkUTF16le :: CodingFailureMode -> TextEncoding
+mkUTF16le cfm = TextEncoding { textEncodingName = "UTF16-LE",
+                               mkTextDecoder = utf16le_DF cfm,
+                               mkTextEncoder = utf16le_EF cfm }
  
-utf16le_DF :: IO (TextDecoder ())
-utf16le_DF =
+utf16le_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf16le_DF cfm =
    return (BufferCodec {
               encode   = utf16le_decode,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
-utf16le_EF :: IO (TextEncoder ())
-utf16le_EF =
+utf16le_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf16le_EF cfm =
    return (BufferCodec {
               encode   = utf16le_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
@@ -199,8 +201,9 @@ utf16be_decode
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
         loop !ir !ow
-         | ow >= os || ir >= iw  =  done ir ow
-         | ir + 1 == iw          =  done ir ow
+         | ow >= os     = done OutputUnderflow ir ow
+         | ir >= iw     = done InputUnderflow ir ow
+         | ir + 1 == iw = done InputUnderflow ir ow
           | otherwise = do
                c0 <- readWord8Buf iraw ir
                c1 <- readWord8Buf iraw (ir+1)
@@ -208,7 +211,7 @@ utf16be_decode
                if validate1 x1
                   then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
                           loop (ir+2) ow'
-                 else if iw - ir < 4 then done ir ow else do
+                 else if iw - ir < 4 then done InputUnderflow ir ow else do
                        c2 <- readWord8Buf iraw (ir+2)
                        c3 <- readWord8Buf iraw (ir+3)
                        let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
@@ -216,12 +219,13 @@ utf16be_decode
                        ow' <- writeCharBuf oraw ow (chr2 x1 x2)
                        loop (ir+4) ow'
           where
-           invalid = if ir > ir0 then done ir ow else ioe_decodingError
+           invalid = done InvalidSequence ir ow
  
         -- lambda-lifted, to avoid thunks being built in the inner-loop:
-       done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                          else input{ bufL=ir },
-                         output{ bufR=ow })
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
      in
      loop ir0 ow0
  
@@ -231,8 +235,9 @@ utf16le_decode
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
         loop !ir !ow
-         | ow >= os || ir >= iw  =  done ir ow
-         | ir + 1 == iw          =  done ir ow
+         | ow >= os     = done OutputUnderflow ir ow
+         | ir >= iw     = done InputUnderflow ir ow
+         | ir + 1 == iw = done InputUnderflow ir ow
           | otherwise = do
                c0 <- readWord8Buf iraw ir
                c1 <- readWord8Buf iraw (ir+1)
@@ -240,7 +245,7 @@ utf16le_decode
                if validate1 x1
                   then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
                           loop (ir+2) ow'
-                 else if iw - ir < 4 then done ir ow else do
+                 else if iw - ir < 4 then done InputUnderflow ir ow else do
                        c2 <- readWord8Buf iraw (ir+2)
                        c3 <- readWord8Buf iraw (ir+3)
                        let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
@@ -248,40 +253,37 @@ utf16le_decode
                        ow' <- writeCharBuf oraw ow (chr2 x1 x2)
                        loop (ir+4) ow'
           where
-           invalid = if ir > ir0 then done ir ow else ioe_decodingError
+           invalid = done InvalidSequence ir ow
  
         -- lambda-lifted, to avoid thunks being built in the inner-loop:
-       done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                          else input{ bufL=ir },
-                         output{ bufR=ow })
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
      in
      loop ir0 ow0
  
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
-     (IOError Nothing InvalidArgument "utf16_decode"
-          "invalid UTF-16 byte sequence" Nothing Nothing)
-
  utf16be_encode :: EncodeBuffer
  utf16be_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ir >= iw     =  done ir ow
-        | os - ow < 2  =  done ir ow
+        | ir >= iw     =  done InputUnderflow ir ow
+        | os - ow < 2  =  done OutputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
             case ord c of
-             x | x < 0x10000 -> do
+             x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
                      writeWord8Buf oraw ow     (fromIntegral (x `shiftR` 8))
                      writeWord8Buf oraw (ow+1) (fromIntegral x)
                      loop ir' (ow+2)
                 | otherwise -> do
-                    if os - ow < 4 then done ir ow else do
+                    if os - ow < 4 then done OutputUnderflow ir ow else do
                      let 
                           n1 = x - 0x10000
                           c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
@@ -303,21 +305,22 @@ utf16le_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ir >= iw     =  done ir ow
-        | os - ow < 2  =  done ir ow
+        | ir >= iw     =  done InputUnderflow ir ow
+        | os - ow < 2  =  done OutputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
             case ord c of
-             x | x < 0x10000 -> do
+             x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
                      writeWord8Buf oraw ow     (fromIntegral x)
                      writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
                      loop ir' (ow+2)
                 | otherwise ->
-                    if os - ow < 4 then done ir ow else do
+                    if os - ow < 4 then done OutputUnderflow ir ow else do
                      let 
                           n1 = x - 0x10000
                           c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
diff --git a/GHC/IO/Encoding/UTF32.hs b/GHC/IO/Encoding/UTF32.hs

index 1eef105..89a0d11 100644 (file)
--- a/GHC/IO/Encoding/UTF32.hs
+++ b/GHC/IO/Encoding/UTF32.hs
@@ -24,15 +24,15 @@
  -----------------------------------------------------------------------------
  
  module GHC.IO.Encoding.UTF32 (
-  utf32,
+  utf32, mkUTF32,
    utf32_decode,
    utf32_encode,
  
-  utf32be,
+  utf32be, mkUTF32be,
    utf32be_decode,
    utf32be_encode,
  
-  utf32le,
+  utf32le, mkUTF32le,
    utf32le_decode,
    utf32le_encode,
    ) where
@@ -41,8 +41,8 @@ import GHC.Base
  import GHC.Real
  import GHC.Num
  -- import GHC.IO
-import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.Word
  import Data.Bits
@@ -53,25 +53,30 @@ import GHC.IORef
  -- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
  
  utf32  :: TextEncoding
-utf32 = TextEncoding { textEncodingName = "UTF-32",
-                       mkTextDecoder = utf32_DF,
-                      mkTextEncoder = utf32_EF }
+utf32 = mkUTF32 ErrorOnCodingFailure
  
-utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
-utf32_DF = do
+mkUTF32 :: CodingFailureMode -> TextEncoding
+mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32",
+                             mkTextDecoder = utf32_DF cfm,
+                             mkTextEncoder = utf32_EF cfm }
+
+utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
+utf32_DF cfm = do
    seen_bom <- newIORef Nothing
    return (BufferCodec {
               encode   = utf32_decode seen_bom,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = readIORef seen_bom,
               setState = writeIORef seen_bom
            })
  
-utf32_EF :: IO (TextEncoder Bool)
-utf32_EF = do
+utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf32_EF cfm = do
    done_bom <- newIORef False
    return (BufferCodec {
               encode   = utf32_encode done_bom,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = readIORef done_bom,
               setState = writeIORef done_bom
@@ -84,7 +89,7 @@ utf32_encode done_bom input
    b <- readIORef done_bom
    if b then utf32_native_encode input output
         else if os - ow < 4
-               then return (input,output)
+               then return (OutputUnderflow, input,output)
                 else do
                      writeIORef done_bom True
                      writeWord8Buf oraw ow     bom0
@@ -102,7 +107,7 @@ utf32_decode seen_bom
     case mb of
       Just decode -> decode input output
       Nothing ->
-       if iw - ir < 4 then return (input,output) else do
+       if iw - ir < 4 then return (InputUnderflow, input,output) else do
         c0 <- readWord8Buf iraw ir
         c1 <- readWord8Buf iraw (ir+1)
         c2 <- readWord8Buf iraw (ir+2)
@@ -136,23 +141,28 @@ utf32_native_encode = utf32be_encode
  -- UTF32LE and UTF32BE
  
  utf32be :: TextEncoding
-utf32be = TextEncoding { textEncodingName = "UTF-32BE",
-                         mkTextDecoder = utf32be_DF,
-                        mkTextEncoder = utf32be_EF }
+utf32be = mkUTF32be ErrorOnCodingFailure
+
+mkUTF32be :: CodingFailureMode -> TextEncoding
+mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE",
+                               mkTextDecoder = utf32be_DF cfm,
+                               mkTextEncoder = utf32be_EF cfm }
  
-utf32be_DF :: IO (TextDecoder ())
-utf32be_DF =
+utf32be_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf32be_DF cfm =
    return (BufferCodec {
               encode   = utf32be_decode,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
-utf32be_EF :: IO (TextEncoder ())
-utf32be_EF =
+utf32be_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf32be_EF cfm =
    return (BufferCodec {
               encode   = utf32be_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
@@ -160,23 +170,28 @@ utf32be_EF =
  
  
  utf32le :: TextEncoding
-utf32le = TextEncoding { textEncodingName = "UTF-32LE",
-                         mkTextDecoder = utf32le_DF,
-                        mkTextEncoder = utf32le_EF }
+utf32le = mkUTF32le ErrorOnCodingFailure
  
-utf32le_DF :: IO (TextDecoder ())
-utf32le_DF =
+mkUTF32le :: CodingFailureMode -> TextEncoding
+mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE",
+                               mkTextDecoder = utf32le_DF cfm,
+                               mkTextEncoder = utf32le_EF cfm }
+
+utf32le_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf32le_DF cfm =
    return (BufferCodec {
               encode   = utf32le_decode,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
-utf32le_EF :: IO (TextEncoder ())
-utf32le_EF =
+utf32le_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf32le_EF cfm =
    return (BufferCodec {
               encode   = utf32le_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
@@ -189,7 +204,8 @@ utf32be_decode
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
         loop !ir !ow
-         | ow >= os || iw - ir < 4 =  done ir ow
+         | ow >= os    = done OutputUnderflow ir ow
+         | iw - ir < 4 = done InputUnderflow  ir ow
           | otherwise = do
                c0 <- readWord8Buf iraw ir
                c1 <- readWord8Buf iraw (ir+1)
@@ -200,12 +216,13 @@ utf32be_decode
                ow' <- writeCharBuf oraw ow x1
                loop (ir+4) ow'
           where
-           invalid = if ir > ir0 then done ir ow else ioe_decodingError
+           invalid = done InvalidSequence ir ow
  
         -- lambda-lifted, to avoid thunks being built in the inner-loop:
-       done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                          else input{ bufL=ir },
-                         output{ bufR=ow })
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
      in
      loop ir0 ow0
  
@@ -215,7 +232,8 @@ utf32le_decode
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
         loop !ir !ow
-         | ow >= os || iw - ir < 4 =  done ir ow
+         | ow >= os    = done OutputUnderflow ir ow
+         | iw - ir < 4 = done InputUnderflow  ir ow
           | otherwise = do
                c0 <- readWord8Buf iraw ir
                c1 <- readWord8Buf iraw (ir+1)
@@ -226,39 +244,37 @@ utf32le_decode
                ow' <- writeCharBuf oraw ow x1
                loop (ir+4) ow'
           where
-           invalid = if ir > ir0 then done ir ow else ioe_decodingError
+           invalid = done InvalidSequence ir ow
  
         -- lambda-lifted, to avoid thunks being built in the inner-loop:
-       done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                          else input{ bufL=ir },
-                         output{ bufR=ow })
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
      in
      loop ir0 ow0
  
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
-     (IOError Nothing InvalidArgument "utf32_decode"
-          "invalid UTF-32 byte sequence" Nothing Nothing)
-
  utf32be_encode :: EncodeBuffer
  utf32be_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ir >= iw     =  done ir ow
-        | os - ow < 4  =  done ir ow
+        | ir >= iw    = done InputUnderflow  ir ow
+        | os - ow < 4 = done OutputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
-           let (c0,c1,c2,c3) = ord4 c
-           writeWord8Buf oraw ow     c0
-           writeWord8Buf oraw (ow+1) c1
-           writeWord8Buf oraw (ow+2) c2
-           writeWord8Buf oraw (ow+3) c3
-           loop ir' (ow+4)
+           if isSurrogate c then done InvalidSequence ir ow else do
+             let (c0,c1,c2,c3) = ord4 c
+             writeWord8Buf oraw ow     c0
+             writeWord8Buf oraw (ow+1) c1
+             writeWord8Buf oraw (ow+2) c2
+             writeWord8Buf oraw (ow+3) c3
+             loop ir' (ow+4)
      in
      loop ir0 ow0
  
@@ -267,20 +283,22 @@ utf32le_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ir >= iw     =  done ir ow
-        | os - ow < 4  =  done ir ow
+        | ir >= iw    = done InputUnderflow  ir ow
+        | os - ow < 4 = done OutputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
-           let (c0,c1,c2,c3) = ord4 c
-           writeWord8Buf oraw ow     c3
-           writeWord8Buf oraw (ow+1) c2
-           writeWord8Buf oraw (ow+2) c1
-           writeWord8Buf oraw (ow+3) c0
-           loop ir' (ow+4)
+           if isSurrogate c then done InvalidSequence ir ow else do
+             let (c0,c1,c2,c3) = ord4 c
+             writeWord8Buf oraw ow     c3
+             writeWord8Buf oraw (ow+1) c2
+             writeWord8Buf oraw (ow+2) c1
+             writeWord8Buf oraw (ow+3) c0
+             loop ir' (ow+4)
      in
      loop ir0 ow0
  
diff --git a/GHC/IO/Encoding/UTF8.hs b/GHC/IO/Encoding/UTF8.hs

index dea4fde..55d09c8 100644 (file)
--- a/GHC/IO/Encoding/UTF8.hs
+++ b/GHC/IO/Encoding/UTF8.hs
@@ -24,8 +24,8 @@
  -----------------------------------------------------------------------------
  
  module GHC.IO.Encoding.UTF8 (
-  utf8,
-  utf8_bom,
+  utf8, mkUTF8,
+  utf8_bom, mkUTF8_bom
    ) where
  
  import GHC.Base
@@ -33,56 +33,66 @@ import GHC.Real
  import GHC.Num
  import GHC.IORef
  -- import GHC.IO
-import GHC.IO.Exception
  import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
  import GHC.IO.Encoding.Types
  import GHC.Word
  import Data.Bits
-import Data.Maybe
  
  utf8 :: TextEncoding
-utf8 = TextEncoding { textEncodingName = "UTF-8",
-                      mkTextDecoder = utf8_DF,
-                     mkTextEncoder = utf8_EF }
+utf8 = mkUTF8 ErrorOnCodingFailure
  
-utf8_DF :: IO (TextDecoder ())
-utf8_DF =
+mkUTF8 :: CodingFailureMode -> TextEncoding
+mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8",
+                            mkTextDecoder = utf8_DF cfm,
+                            mkTextEncoder = utf8_EF cfm }
+
+
+utf8_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf8_DF cfm =
    return (BufferCodec {
               encode   = utf8_decode,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
-utf8_EF :: IO (TextEncoder ())
-utf8_EF =
+utf8_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf8_EF cfm =
    return (BufferCodec {
               encode   = utf8_encode,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = return (),
               setState = const $ return ()
            })
  
  utf8_bom :: TextEncoding
-utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM",
-                          mkTextDecoder = utf8_bom_DF,
-                          mkTextEncoder = utf8_bom_EF }
+utf8_bom = mkUTF8_bom ErrorOnCodingFailure
  
-utf8_bom_DF :: IO (TextDecoder Bool)
-utf8_bom_DF = do
+mkUTF8_bom :: CodingFailureMode -> TextEncoding
+mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM",
+                                mkTextDecoder = utf8_bom_DF cfm,
+                                mkTextEncoder = utf8_bom_EF cfm }
+
+utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool)
+utf8_bom_DF cfm = do
     ref <- newIORef True
     return (BufferCodec {
               encode   = utf8_bom_decode ref,
+             recover  = recoverDecode cfm,
               close    = return (),
               getState = readIORef ref,
               setState = writeIORef ref
            })
  
-utf8_bom_EF :: IO (TextEncoder Bool)
-utf8_bom_EF = do
+utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf8_bom_EF cfm = do
     ref <- newIORef True
     return (BufferCodec {
               encode   = utf8_bom_encode ref,
+             recover  = recoverEncode cfm,
               close    = return (),
               getState = readIORef ref,
               setState = writeIORef ref
@@ -98,13 +108,13 @@ utf8_bom_decode ref
        then utf8_decode input output
        else do
         let no_bom = do writeIORef ref False; utf8_decode input output
-       if iw - ir < 1 then return (input,output) else do
+       if iw - ir < 1 then return (InputUnderflow,input,output) else do
         c0 <- readWord8Buf iraw ir
         if (c0 /= bom0) then no_bom else do
-       if iw - ir < 2 then return (input,output) else do
+       if iw - ir < 2 then return (InputUnderflow,input,output) else do
         c1 <- readWord8Buf iraw (ir+1)
         if (c1 /= bom1) then no_bom else do
-       if iw - ir < 3 then return (input,output) else do
+       if iw - ir < 3 then return (InputUnderflow,input,output) else do
         c2 <- readWord8Buf iraw (ir+2)
         if (c2 /= bom2) then no_bom else do
         -- found a BOM, ignore it and carry on
@@ -118,7 +128,7 @@ utf8_bom_encode ref input
    b <- readIORef ref
    if not b then utf8_encode input output
             else if os - ow < 3
-                  then return (input,output)
+                  then return (OutputUnderflow,input,output)
                    else do
                      writeIORef ref False
                      writeWord8Buf oraw ow     bom0
@@ -137,7 +147,8 @@ utf8_decode
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
         loop !ir !ow
-         | ow >= os || ir >= iw = done ir ow
+         | ow >= os = done OutputUnderflow ir ow
+         | ir >= iw = done InputUnderflow ir ow
           | otherwise = do
                c0 <- readWord8Buf iraw ir
                case c0 of
@@ -145,19 +156,19 @@ utf8_decode
                             ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
                             loop (ir+1) ow'
                    | c0 >= 0xc0 && c0 <= 0xdf ->
-                           if iw - ir < 2 then done ir ow else do
+                           if iw - ir < 2 then done InputUnderflow ir ow else do
                             c1 <- readWord8Buf iraw (ir+1)
                             if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
                             ow' <- writeCharBuf oraw ow (chr2 c0 c1)
                             loop (ir+2) ow'
                    | c0 >= 0xe0 && c0 <= 0xef ->
                        case iw - ir of
-                        1 -> done ir ow
+                        1 -> done InputUnderflow ir ow
                          2 -> do -- check for an error even when we don't have
                                  -- the full sequence yet (#3341)
                             c1 <- readWord8Buf iraw (ir+1)
                             if not (validate3 c0 c1 0x80) 
-                              then invalid else done ir ow
+                              then invalid else done InputUnderflow ir ow
                          _ -> do
                             c1 <- readWord8Buf iraw (ir+1)
                             c2 <- readWord8Buf iraw (ir+2)
@@ -166,17 +177,17 @@ utf8_decode
                             loop (ir+3) ow'
                    | c0 >= 0xf0 ->
                        case iw - ir of
-                        1 -> done ir ow
+                        1 -> done InputUnderflow ir ow
                          2 -> do -- check for an error even when we don't have
                                  -- the full sequence yet (#3341)
                             c1 <- readWord8Buf iraw (ir+1)
                             if not (validate4 c0 c1 0x80 0x80)
-                              then invalid else done ir ow
+                              then invalid else done InputUnderflow ir ow
                          3 -> do
                             c1 <- readWord8Buf iraw (ir+1)
                             c2 <- readWord8Buf iraw (ir+2)
                             if not (validate4 c0 c1 c2 0x80)
-                              then invalid else done ir ow
+                              then invalid else done InputUnderflow ir ow
                          _ -> do
                             c1 <- readWord8Buf iraw (ir+1)
                             c2 <- readWord8Buf iraw (ir+2)
@@ -187,30 +198,28 @@ utf8_decode
                    | otherwise ->
                             invalid
           where
-           invalid = if ir > ir0 then done ir ow else ioe_decodingError
+           invalid = done InvalidSequence ir ow
  
         -- lambda-lifted, to avoid thunks being built in the inner-loop:
-       done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                          else input{ bufL=ir },
-                         output{ bufR=ow })
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
     in
     loop ir0 ow0
  
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
-     (IOError Nothing InvalidArgument "utf8_decode"
-          "invalid UTF-8 byte sequence" Nothing Nothing)
-
  utf8_encode :: EncodeBuffer
  utf8_encode
    input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
    output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
   = let 
-      done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
-                                         else input{ bufL=ir },
-                             output{ bufR=ow })
+      done why !ir !ow = return (why,
+                                 if ir == iw then input{ bufL=0, bufR=0 }
+                                             else input{ bufL=ir },
+                                 output{ bufR=ow })
        loop !ir !ow
-        | ow >= os || ir >= iw = done ir ow
+        | ow >= os = done OutputUnderflow ir ow
+        | ir >= iw = done InputUnderflow ir ow
          | otherwise = do
             (c,ir') <- readCharBuf iraw ir
             case ord c of
@@ -218,20 +227,20 @@ utf8_encode
                      writeWord8Buf oraw ow (fromIntegral x)
                      loop ir' (ow+1)
                 | x <= 0x07FF ->
-                    if os - ow < 2 then done ir ow else do
+                    if os - ow < 2 then done OutputUnderflow ir ow else do
                      let (c1,c2) = ord2 c
                      writeWord8Buf oraw ow     c1
                      writeWord8Buf oraw (ow+1) c2
                      loop ir' (ow+2)
-               | x <= 0xFFFF -> do
-                    if os - ow < 3 then done ir ow else do
+               | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do
+                    if os - ow < 3 then done OutputUnderflow ir ow else do
                      let (c1,c2,c3) = ord3 c
                      writeWord8Buf oraw ow     c1
                      writeWord8Buf oraw (ow+1) c2
                      writeWord8Buf oraw (ow+2) c3
                      loop ir' (ow+3)
                 | otherwise -> do
-                    if os - ow < 4 then done ir ow else do
+                    if os - ow < 4 then done OutputUnderflow ir ow else do
                      let (c1,c2,c3,c4) = ord4 c
                      writeWord8Buf oraw ow     c1
                      writeWord8Buf oraw (ow+1) c2
diff --git a/GHC/IO/FD.hs b/GHC/IO/FD.hs

index 1730885..65ed913 100644 (file)
--- a/GHC/IO/FD.hs
+++ b/GHC/IO/FD.hs
@@ -47,6 +47,9 @@ import qualified GHC.IO.Device
  import GHC.IO.Device (SeekMode(..), IODeviceType(..))
  import GHC.Conc.IO
  import GHC.IO.Exception
+#ifdef mingw32_HOST_OS
+import GHC.Windows
+#endif
  
  import Foreign
  import Foreign.C
@@ -619,9 +622,6 @@ blockingWriteRawBufferPtr loc fd buf off len
        -- for this case.  We need to detect EPIPE correctly, because it
        -- shouldn't be reported as an error when it happens on stdout.
  
-foreign import ccall unsafe "maperrno"             -- in Win32Utils.c
-   c_maperrno :: IO ()
-
  -- NOTE: "safe" versions of the read/write calls for use by the threaded RTS.
  -- These calls may block, but that's ok.
  
@@ -664,8 +664,3 @@ foreign import ccall unsafe "lockFile"
  foreign import ccall unsafe "unlockFile"
    unlockFile :: CInt -> IO CInt
  #endif
-
-puts :: String -> IO ()
-puts s = do _ <- withCStringLen s $ \(p,len) ->
-                     c_write 1 (castPtr p) (fromIntegral len)
-            return ()
diff --git a/GHC/IO/Handle/Internals.hs b/GHC/IO/Handle/Internals.hs

index 1dbd4bb..a2b644f 100644 (file)
--- a/GHC/IO/Handle/Internals.hs
+++ b/GHC/IO/Handle/Internals.hs
@@ -355,6 +355,38 @@ ioe_bufsiz n = ioException
          ("illegal buffer size " ++ showsPrec 9 n []) Nothing Nothing)
                                  -- 9 => should be parens'ified.
  
+-- ---------------------------------------------------------------------------
+-- Wrapper for Handle encoding/decoding.
+
+-- The interface for TextEncoding changed so that a TextEncoding doesn't raise
+-- an exception if it encounters an invalid sequnce. Furthermore, encoding
+-- returns a reason as to why encoding stopped, letting us know if it was due
+-- to input/output underflow or an invalid sequence.
+--
+-- This code adapts this elaborated interface back to the original TextEncoding
+-- interface.
+--
+-- FIXME: it is possible that Handle code using the haDecoder/haEncoder fields
+-- could be made clearer by using the 'encode' interface directly. I have not
+-- looked into this.
+--
+-- FIXME: we should use recover to deal with EOF, rather than always throwing an
+-- IOException (ioe_invalidCharacter).
+
+streamEncode :: BufferCodec from to state
+             -> Buffer from -> Buffer to
+             -> IO (Buffer from, Buffer to)
+streamEncode codec from to = go (from, to)
+  where 
+    go (from, to) = do
+      (why, from', to') <- encode codec from to
+      -- When we are dealing with Handles, we don't care about input/output
+      -- underflow particularly, and we want to delay errors about invalid
+      -- sequences as far as possible.
+      case why of
+        Encoding.InvalidSequence | bufL from == bufL from' -> recover codec from' to' >>= go
+        _ -> return (from', to')
+
  -- -----------------------------------------------------------------------------
  -- Handle Finalizers
  
@@ -476,7 +508,7 @@ writeCharBuffer h_@Handle__{..} !cbuf = do
  
    (cbuf',bbuf') <- case haEncoder of
      Nothing      -> latin1_encode cbuf bbuf
-    Just encoder -> (encode encoder) cbuf bbuf
+    Just encoder -> (streamEncode encoder) cbuf bbuf
  
    debugIO ("writeCharBuffer after encoding: cbuf=" ++ summaryBuffer cbuf' ++
          " bbuf=" ++ summaryBuffer bbuf')
@@ -537,7 +569,7 @@ flushCharReadBuffer Handle__{..} = do
        -- restore the codec state
        setState decoder codec_state
      
-      (bbuf1,cbuf1) <- (encode decoder) bbuf0
+      (bbuf1,cbuf1) <- (streamEncode decoder) bbuf0
                                 cbuf0{ bufL=0, bufR=0, bufSize = bufL cbuf0 }
      
        debugIO ("finished, bbuf=" ++ summaryBuffer bbuf1 ++
@@ -801,7 +833,7 @@ readTextDevice h_@Handle__{..} cbuf = do
            Just decoder -> do
                 state <- getState decoder
                 writeIORef haLastDecode (state, bbuf1)
-               (encode decoder) bbuf1 cbuf
+               (streamEncode decoder) bbuf1 cbuf
  
    debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++ 
          " bbuf=" ++ summaryBuffer bbuf2)
@@ -835,7 +867,7 @@ readTextDevice' h_@Handle__{..} bbuf0 cbuf = do
            Just decoder -> do
                 state <- getState decoder
                 writeIORef haLastDecode (state, bbuf2)
-               (encode decoder) bbuf2 cbuf
+               (streamEncode decoder) bbuf2 cbuf
  
    debugIO ("readTextDevice' after decoding: cbuf=" ++ summaryBuffer cbuf' ++ 
          " bbuf=" ++ summaryBuffer bbuf3)
@@ -872,7 +904,7 @@ decodeByteBuf h_@Handle__{..} cbuf = do
            Just decoder -> do
                 state <- getState decoder
                 writeIORef haLastDecode (state, bbuf0)
-               (encode decoder) bbuf0 cbuf
+               (streamEncode decoder) bbuf0 cbuf
  
    writeIORef haByteBuffer bbuf2
    return cbuf'
diff --git a/GHC/IO/Handle/Text.hs b/GHC/IO/Handle/Text.hs

index 160e9d7..0d0e05b 100644 (file)
--- a/GHC/IO/Handle/Text.hs
+++ b/GHC/IO/Handle/Text.hs
@@ -38,6 +38,7 @@ import GHC.IO.FD
  import GHC.IO.Buffer
  import qualified GHC.IO.BufferedIO as Buffered
  import GHC.IO.Exception
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
  import GHC.Exception
  import GHC.IO.Handle.Types
  import GHC.IO.Handle.Internals
@@ -279,10 +280,10 @@ unpack !buf !r !w acc0
                   else do c1 <- peekElemOff pbuf (i-1)
                           let c = (fromIntegral c1 - 0xd800) * 0x400 +
                                   (fromIntegral c2 - 0xdc00) + 0x10000
-                         unpackRB (unsafeChr c : acc) (i-2)
+                         unpackRB (desurrogatifyRoundtripCharacter (unsafeChr c) : acc) (i-2)
  #else
                c <- peekElemOff pbuf i
-              unpackRB (c:acc) (i-1)
+              unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
  #endif
       in
       unpackRB acc0 (w-1)
@@ -305,7 +306,7 @@ unpack_nl !buf !r !w acc0
                              then unpackRB ('\n':acc) (i-2)
                              else unpackRB ('\n':acc) (i-1)
                   else do
-                         unpackRB (c:acc) (i-1)
+                         unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
       in do
       c <- peekElemOff pbuf (w-1)
       if (c == '\r')
@@ -585,7 +586,7 @@ writeBlocks hdl line_buffered add_nl nl
             else do
                 shoveString n' cs rest
       | otherwise = do
-        n' <- writeCharBuf raw n c
+        n' <- writeCharBuf raw n (surrogatifyRoundtripCharacter c)
          shoveString n' cs rest
    in
    shoveString 0 s (if add_nl then "\n" else "")
diff --git a/GHC/Windows.hs b/GHC/Windows.hs

new file mode 100644 (file)

index 0000000..2a74a5f
--- /dev/null
+++ b/GHC/Windows.hs
@@ -0,0 +1,44 @@
+{-# LANGUAGE NoImplicitPrelude, ForeignFunctionInterface #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module      :  GHC.Windows
+-- Copyright   :  (c) The University of Glasgow, 2009
+-- License     :  see libraries/base/LICENSE
+-- 
+-- Maintainer  :  libraries@haskell.org
+-- Stability   :  internal
+-- Portability :  non-portable
+--
+-- Windows functionality used by several modules.
+--
+-- ToDo: this just duplicates part of System.Win32.Types, which isn't
+-- available yet.  We should move some Win32 functionality down here,
+-- maybe as part of the grand reorganisation of the base package...
+--
+-----------------------------------------------------------------------------
+
+module GHC.Windows where
+
+import GHC.Base
+import GHC.Ptr
+
+import Data.Word
+
+import Foreign.C.Error (throwErrno)
+import Foreign.C.Types
+
+
+type HANDLE       = Ptr ()
+type DWORD        = Word32
+
+type LPTSTR = Ptr CWchar
+
+iNFINITE :: DWORD
+iNFINITE = 0xFFFFFFFF -- urgh
+
+throwGetLastError :: String -> IO a
+throwGetLastError where_from = c_maperrno >> throwErrno where_from
+
+foreign import ccall unsafe "maperrno"             -- in Win32Utils.c
+   c_maperrno :: IO ()
+
diff --git a/System/Environment.hs b/System/Environment.hs

index 2b6dec2..cf73c3d 100644 (file)
--- a/System/Environment.hs
+++ b/System/Environment.hs
@@ -31,13 +31,20 @@ module System.Environment
  import Prelude
  
  #ifdef __GLASGOW_HASKELL__
-import Data.List
  import Foreign
  import Foreign.C
  import Control.Exception.Base   ( bracket )
-import Control.Monad
  -- import GHC.IO
  import GHC.IO.Exception
+import GHC.IO.Encoding (fileSystemEncoding)
+import qualified GHC.Foreign as GHC
+import Data.List
+#ifdef mingw32_HOST_OS
+import GHC.Environment
+import GHC.Windows
+#else
+import Control.Monad
+#endif
  #endif
  
  #ifdef __HUGS__
@@ -52,25 +59,78 @@ import System
    )
  #endif
  
+#ifdef __GLASGOW_HASKELL__
  -- ---------------------------------------------------------------------------
  -- getArgs, getProgName, getEnv
  
+#ifdef mingw32_HOST_OS
+
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+
+getWin32ProgArgv_certainly :: IO [String]
+getWin32ProgArgv_certainly = do
+       mb_argv <- getWin32ProgArgv
+       case mb_argv of
+         Nothing   -> fmap dropRTSArgs getFullArgs
+         Just argv -> return argv
+
+withWin32ProgArgv :: [String] -> IO a -> IO a
+withWin32ProgArgv argv act = bracket begin setWin32ProgArgv (\_ -> act)
+  where
+    begin = do
+         mb_old_argv <- getWin32ProgArgv
+         setWin32ProgArgv (Just argv)
+         return mb_old_argv
+
+getWin32ProgArgv :: IO (Maybe [String])
+getWin32ProgArgv = alloca $ \p_argc -> alloca $ \p_argv -> do
+       c_getWin32ProgArgv p_argc p_argv
+       argc <- peek p_argc
+       argv_p <- peek p_argv
+       if argv_p == nullPtr
+        then return Nothing
+        else do
+         argv_ps <- peekArray (fromIntegral argc) argv_p
+         fmap Just $ mapM peekCWString argv_ps
+
+setWin32ProgArgv :: Maybe [String] -> IO ()
+setWin32ProgArgv Nothing = c_setWin32ProgArgv 0 nullPtr
+setWin32ProgArgv (Just argv) = withMany withCWString argv $ \argv_ps -> withArrayLen argv_ps $ \argc argv_p -> do
+       c_setWin32ProgArgv (fromIntegral argc) argv_p
+
+foreign import ccall unsafe "getWin32ProgArgv"
+  c_getWin32ProgArgv :: Ptr CInt -> Ptr (Ptr CWString) -> IO ()
+
+foreign import ccall unsafe "setWin32ProgArgv"
+  c_setWin32ProgArgv :: CInt -> Ptr CWString -> IO ()
+
+dropRTSArgs :: [String] -> [String]
+dropRTSArgs []             = []
+dropRTSArgs ("+RTS":rest)  = dropRTSArgs (dropWhile (/= "-RTS") rest)
+dropRTSArgs ("--RTS":rest) = rest
+dropRTSArgs ("-RTS":rest)  = dropRTSArgs rest
+dropRTSArgs (arg:rest)     = arg : dropRTSArgs rest
+
+#endif
+
  -- | Computation 'getArgs' returns a list of the program's command
  -- line arguments (not including the program name).
-
-#ifdef __GLASGOW_HASKELL__
  getArgs :: IO [String]
+
+#ifdef mingw32_HOST_OS
+getArgs =  fmap tail getWin32ProgArgv_certainly
+#else
  getArgs =
    alloca $ \ p_argc ->
    alloca $ \ p_argv -> do
     getProgArgv p_argc p_argv
     p    <- fromIntegral `liftM` peek p_argc
     argv <- peek p_argv
-   peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString
-
+   peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString fileSystemEncoding)
  
  foreign import ccall unsafe "getProgArgv"
    getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
+#endif
  
  {-|
  Computation 'getProgName' returns the name of the program as it was
@@ -83,6 +143,10 @@ between platforms: on Windows, for example, a program invoked as foo
  is probably really @FOO.EXE@, and that is what 'getProgName' will return.
  -}
  getProgName :: IO String
+#ifdef mingw32_HOST_OS
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+getProgName = fmap (basename . head) getWin32ProgArgv_certainly
+#else
  getProgName =
    alloca $ \ p_argc ->
    alloca $ \ p_argv -> do
@@ -92,23 +156,24 @@ getProgName =
  
  unpackProgName  :: Ptr (Ptr CChar) -> IO String   -- argv[0]
  unpackProgName argv = do
-  s <- peekElemOff argv 0 >>= peekCString
+  s <- peekElemOff argv 0 >>= GHC.peekCString fileSystemEncoding
    return (basename s)
-  where
-   basename :: String -> String
-   basename f = go f f
-    where
-      go acc [] = acc
-      go acc (x:xs)
-        | isPathSeparator x = go xs xs
-        | otherwise         = go acc xs
-
-   isPathSeparator :: Char -> Bool
-   isPathSeparator '/'  = True
-#ifdef mingw32_HOST_OS 
-   isPathSeparator '\\' = True
  #endif
-   isPathSeparator _    = False
+
+basename :: FilePath -> FilePath
+basename f = go f f
+ where
+  go acc [] = acc
+  go acc (x:xs)
+    | isPathSeparator x = go xs xs
+    | otherwise         = go acc xs
+
+  isPathSeparator :: Char -> Bool
+  isPathSeparator '/'  = True
+#ifdef mingw32_HOST_OS
+  isPathSeparator '\\' = True
+#endif
+  isPathSeparator _    = False
  
  
  -- | Computation 'getEnv' @var@ returns the value
@@ -120,16 +185,43 @@ unpackProgName argv = do
  --    does not exist.
  
  getEnv :: String -> IO String
+#ifdef mingw32_HOST_OS
+getEnv name = withCWString name $ \s -> try_size s 256
+  where
+    try_size s size = allocaArray (fromIntegral size) $ \p_value -> do
+      res <- c_GetEnvironmentVariable s p_value size
+      case res of
+        0 -> do
+                 err <- c_GetLastError
+                 if err == eRROR_ENVVAR_NOT_FOUND
+                  then ioe_missingEnvVar name
+                  else throwGetLastError "getEnv"
+        _ | res > size -> try_size s res -- Rare: size increased between calls to GetEnvironmentVariable
+          | otherwise  -> peekCWString p_value
+
+eRROR_ENVVAR_NOT_FOUND :: DWORD
+eRROR_ENVVAR_NOT_FOUND = 203
+
+foreign import stdcall unsafe "windows.h GetLastError"
+  c_GetLastError:: IO DWORD
+
+foreign import stdcall unsafe "windows.h GetEnvironmentVariableW"
+  c_GetEnvironmentVariable :: LPTSTR -> LPTSTR -> DWORD -> IO DWORD
+#else
  getEnv name =
      withCString name $ \s -> do
        litstring <- c_getenv s
        if litstring /= nullPtr
-        then peekCString litstring
-        else ioException (IOError Nothing NoSuchThing "getEnv"
-                          "no environment variable" Nothing (Just name))
+        then GHC.peekCString fileSystemEncoding litstring
+        else ioe_missingEnvVar name
  
  foreign import ccall unsafe "getenv"
     c_getenv :: CString -> IO (Ptr CChar)
+#endif
+
+ioe_missingEnvVar :: String -> IO a
+ioe_missingEnvVar name = ioException (IOError Nothing NoSuchThing "getEnv"
+                                                                                         "no environment variable" Nothing (Just name))
  
  {-|
  'withArgs' @args act@ - while executing action @act@, have 'getArgs'
@@ -153,48 +245,93 @@ withProgName nm act = do
  -- the duration of an action.
  
  withArgv :: [String] -> IO a -> IO a
-withArgv new_args act = do
+
+#ifdef mingw32_HOST_OS
+-- We have to reflect the updated arguments in the RTS-side variables as
+-- well, because the RTS still consults them for error messages and the like.
+-- If we don't do this then ghc-e005 fails.
+withArgv new_args act = withWin32ProgArgv new_args $ withProgArgv new_args act
+#else
+withArgv = withProgArgv
+#endif
+
+withProgArgv :: [String] -> IO a -> IO a
+withProgArgv new_args act = do
    pName <- System.Environment.getProgName
    existing_args <- System.Environment.getArgs
-  bracket (setArgs new_args)
-          (\argv -> do _ <- setArgs (pName:existing_args)
-                       freeArgv argv)
+  bracket (setProgArgv new_args)
+          (\argv -> do _ <- setProgArgv (pName:existing_args)
+                       freeProgArgv argv)
            (const act)
  
-freeArgv :: Ptr CString -> IO ()
-freeArgv argv = do
+freeProgArgv :: Ptr CString -> IO ()
+freeProgArgv argv = do
    size <- lengthArray0 nullPtr argv
    sequence_ [peek (argv `advancePtr` i) >>= free | i <- [size, size-1 .. 0]]
    free argv
  
-setArgs :: [String] -> IO (Ptr CString)
-setArgs argv = do
-  vs <- mapM newCString argv >>= newArray0 nullPtr
-  setArgsPrim (genericLength argv) vs
+setProgArgv :: [String] -> IO (Ptr CString)
+setProgArgv argv = do
+  vs <- mapM (GHC.newCString fileSystemEncoding) argv >>= newArray0 nullPtr
+  c_setProgArgv (genericLength argv) vs
    return vs
  
  foreign import ccall unsafe "setProgArgv" 
-  setArgsPrim  :: CInt -> Ptr CString -> IO ()
+  c_setProgArgv  :: CInt -> Ptr CString -> IO ()
  
  -- |'getEnvironment' retrieves the entire environment as a
  -- list of @(key,value)@ pairs.
  --
  -- If an environment entry does not contain an @\'=\'@ character,
  -- the @key@ is the whole entry and the @value@ is the empty string.
-
  getEnvironment :: IO [(String, String)]
+
+#ifdef mingw32_HOST_OS
+getEnvironment = bracket c_GetEnvironmentStrings c_FreeEnvironmentStrings $ \pBlock ->
+    if pBlock == nullPtr then return []
+     else go pBlock
+  where
+    go pBlock = do
+        -- The block is terminated by a null byte where there
+        -- should be an environment variable of the form X=Y
+        c <- peek pBlock
+        if c == 0 then return []
+         else do
+          -- Seek the next pair (or terminating null):
+          pBlock' <- seekNull pBlock False
+          -- We now know the length in bytes, but ignore it when
+          -- getting the actual String:
+          str <- peekCWString pBlock
+          fmap (divvy str :) $ go pBlock'
+    
+    -- Returns pointer to the byte *after* the next null
+    seekNull pBlock done = do
+        let pBlock' = pBlock `plusPtr` sizeOf (undefined :: CWchar)
+        if done then return pBlock'
+         else do
+           c <- peek pBlock'
+           seekNull pBlock' (c == (0 :: Word8 ))
+
+foreign import stdcall unsafe "windows.h GetEnvironmentStringsW"
+  c_GetEnvironmentStrings :: IO (Ptr CWchar)
+
+foreign import stdcall unsafe "windows.h FreeEnvironmentStringsW"
+  c_FreeEnvironmentStrings :: Ptr CWchar -> IO Bool
+#else
  getEnvironment = do
     pBlock <- getEnvBlock
     if pBlock == nullPtr then return []
      else do
-      stuff <- peekArray0 nullPtr pBlock >>= mapM peekCString
+      stuff <- peekArray0 nullPtr pBlock >>= mapM (GHC.peekCString fileSystemEncoding)
        return (map divvy stuff)
-  where
-   divvy str =
-      case break (=='=') str of
-        (xs,[])        -> (xs,[]) -- don't barf (like Posix.getEnvironment)
-        (name,_:value) -> (name,value)
  
  foreign import ccall unsafe "__hscore_environ" 
    getEnvBlock :: IO (Ptr CString)
+#endif
+
+divvy :: String -> (String, String)
+divvy str =
+  case break (=='=') str of
+    (xs,[])        -> (xs,[]) -- don't barf (like Posix.getEnvironment)
+    (name,_:value) -> (name,value)
  #endif  /* __GLASGOW_HASKELL__ */
diff --git a/System/IO.hs b/System/IO.hs

index f60ecad..ab52244 100644 (file)
--- a/System/IO.hs
+++ b/System/IO.hs
@@ -245,7 +245,7 @@ import System.Posix.Types
  
  #ifdef __GLASGOW_HASKELL__
  import GHC.Base
-import GHC.IO hiding ( onException )
+import GHC.IO hiding ( bracket, onException )
  import GHC.IO.IOMode
  import GHC.IO.Handle.FD
  import qualified GHC.IO.FD as FD
diff --git a/System/Posix/Internals.hs b/System/Posix/Internals.hs

index 4a83635..6a30ba0 100644 (file)
--- a/System/Posix/Internals.hs
+++ b/System/Posix/Internals.hs
@@ -51,6 +51,10 @@ import GHC.IO
  import GHC.IO.IOMode
  import GHC.IO.Exception
  import GHC.IO.Device
+#ifndef mingw32_HOST_OS
+import {-# SOURCE #-} GHC.IO.Encoding (fileSystemEncoding)
+import qualified GHC.Foreign as GHC
+#endif
  #elif __HUGS__
  import Hugs.Prelude (IOException(..), IOErrorType(..))
  import Hugs.IO (IOMode(..))
@@ -65,6 +69,18 @@ import DIOError
  {-# CFILES cbits/PrelIOUtils.c cbits/consUtils.c #-}
  #endif
  
+
+-- ---------------------------------------------------------------------------
+-- Debugging the base package
+
+puts :: String -> IO ()
+puts s = withCAStringLen (s ++ "\n") $ \(p, len) -> do
+            -- In reality should be withCString, but assume ASCII to avoid loop
+            -- if this is called by GHC.Foreign
+           _ <- c_write 1 (castPtr p) (fromIntegral len)
+           return ()
+
+
  -- ---------------------------------------------------------------------------
  -- Types
  
@@ -171,10 +187,26 @@ fdGetMode fd = do
  
  #ifdef mingw32_HOST_OS
  withFilePath :: FilePath -> (CWString -> IO a) -> IO a
-withFilePath = withCWString 
+withFilePath = withCWString
+
+peekFilePath :: CWString -> IO FilePath
+peekFilePath = peekCWString
  #else
+
  withFilePath :: FilePath -> (CString -> IO a) -> IO a
+peekFilePath :: CString -> IO FilePath
+peekFilePathLen :: CStringLen -> IO FilePath
+
+#if __GLASGOW_HASKELL__
+withFilePath = GHC.withCString fileSystemEncoding
+peekFilePath = GHC.peekCString fileSystemEncoding
+peekFilePathLen = GHC.peekCStringLen fileSystemEncoding
+#else
  withFilePath = withCString
+peekFilePath = peekCString
+peekFilePathLen = peekCStringLen
+#endif
+
  #endif
  
  -- ---------------------------------------------------------------------------
diff --git a/System/Posix/Internals.hs-boot b/System/Posix/Internals.hs-boot

new file mode 100644 (file)

index 0000000..40415fc
--- /dev/null
+++ b/System/Posix/Internals.hs-boot
@@ -0,0 +1,7 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+module System.Posix.Internals where
+
+import GHC.IO
+import GHC.Base
+
+puts :: String -> IO ()
diff --git a/base.cabal b/base.cabal

index 5478cc3..6bb96f6 100644 (file)
--- a/base.cabal
+++ b/base.cabal
@@ -53,6 +53,7 @@ Library {
              GHC.Exts,
              GHC.Float,
              GHC.Float.RealFracMethods,
+            GHC.Foreign,
              GHC.Float.ConversionUtils,
              GHC.ForeignPtr,
              GHC.MVar,
@@ -71,6 +72,7 @@ Library {
              GHC.IO.Encoding.Types,
              GHC.IO.Encoding.Iconv,
              GHC.IO.Encoding.CodePage,
+            GHC.IO.Encoding.Failure,
              GHC.IO.Handle,
              GHC.IO.Handle.Types,
              GHC.IO.Handle.Internals,
@@ -101,6 +103,7 @@ Library {
          if os(windows)
              exposed-modules: GHC.IO.Encoding.CodePage.Table
                               GHC.Conc.Windows
+                             GHC.Windows
      }
      exposed-modules:
          Control.Applicative,
author	Simon Peyton Jones <simonpj@microsoft.com>
	Thu, 19 May 2011 07:38:35 +0000 (08:38 +0100)
committer	Simon Peyton Jones <simonpj@microsoft.com>
	Thu, 19 May 2011 07:38:35 +0000 (08:38 +0100)
Control/Exception/Base.hs		patch \| blob \| history
Foreign/C/String.hs		patch \| blob \| history
GHC/Conc/Windows.hs		patch \| blob \| history
GHC/Environment.hs		patch \| blob \| history
GHC/Foreign.hs	[new file with mode: 0644]	patch \| blob
GHC/IO.hs		patch \| blob \| history
GHC/IO/Encoding.hs		patch \| blob \| history
GHC/IO/Encoding.hs-boot	[new file with mode: 0644]	patch \| blob
GHC/IO/Encoding/CodePage.hs		patch \| blob \| history
GHC/IO/Encoding/Failure.hs	[new file with mode: 0644]	patch \| blob
GHC/IO/Encoding/Iconv.hs		patch \| blob \| history
GHC/IO/Encoding/Latin1.hs		patch \| blob \| history
GHC/IO/Encoding/Types.hs		patch \| blob \| history
GHC/IO/Encoding/UTF16.hs		patch \| blob \| history
GHC/IO/Encoding/UTF32.hs		patch \| blob \| history
GHC/IO/Encoding/UTF8.hs		patch \| blob \| history
GHC/IO/FD.hs		patch \| blob \| history
GHC/IO/Handle/Internals.hs		patch \| blob \| history
GHC/IO/Handle/Text.hs		patch \| blob \| history
GHC/Windows.hs	[new file with mode: 0644]	patch \| blob
System/Environment.hs		patch \| blob \| history
System/IO.hs		patch \| blob \| history
System/Posix/Internals.hs		patch \| blob \| history
System/Posix/Internals.hs-boot	[new file with mode: 0644]	patch \| blob
base.cabal		patch \| blob \| history