patch series fixes #5061, #1414, #3309, #3308, #3307, #4006 and #4855.
The major changes are:
1) Make Foreign.C.String.*CString use the locale encoding
This change follows the FFI specification in Haskell 98, which
has never actually been implemented before.
The functions exported from Foreign.C.String are partially-applied
versions of those from GHC.Foreign, which allows the user to supply
their own TextEncoding.
We also introduce foreignEncoding as the name of the text encoding
that follows the FFI appendix in that it transliterates encoding
errors.
2) I also changed the code so that mkTextEncoding always tries the
native-Haskell decoders in preference to those from iconv, even on
non-Windows. The motivation here is simply that it is better for
compatibility if we do this, and those are the ones you get for
the utf* and latin1* predefined TextEncodings anyway.
3) Implement surrogate-byte error handling mode for TextEncoding
This implements PEP383-like behaviour so that we are able to
roundtrip byte strings through Strings without loss of information.
The withFilePath function now uses this encoding to get to/from CStrings,
so any code that uses that will get the right PEP383 behaviour automatically.
4) Implement three other coding failure modes: ignore, throw error, transliterate
These mimic the behaviour of the GNU Iconv extensions.
#ifdef __GLASGOW_HASKELL__
import GHC.Base
-import GHC.IO hiding (finally,onException)
+import GHC.IO hiding (bracket,finally,onException)
import GHC.IO.Exception
import GHC.Exception
import GHC.Show
-----------------------------------------------------------------------------
module Foreign.C.String ( -- representation of strings in C
-
-- * C strings
CString, -- = Ptr CChar
-- ** Using a locale-dependent encoding
+#ifndef __GLASGOW_HASKELL__
-- | Currently these functions are identical to their @CAString@ counterparts;
-- eventually they will use an encoding determined by the current locale.
+#else
+ -- | These functions are different from their @CAString@ counterparts
+ -- in that they will use an encoding determined by the current locale,
+ -- rather than always assuming ASCII.
+#endif
-- conversion of C strings into Haskell strings
--
import Data.Word
#ifdef __GLASGOW_HASKELL__
+import Control.Monad
+
import GHC.List
import GHC.Real
import GHC.Num
import GHC.Base
+
+import {-# SOURCE #-} GHC.IO.Encoding
+import qualified GHC.Foreign as GHC
#else
import Data.Char ( chr, ord )
#define unsafeChr chr
-- | Marshal a NUL terminated C string into a Haskell string.
--
peekCString :: CString -> IO String
+#ifndef __GLASGOW_HASKELL__
peekCString = peekCAString
+#else
+peekCString = GHC.peekCString foreignEncoding
+#endif
-- | Marshal a C string with explicit length into a Haskell string.
--
peekCStringLen :: CStringLen -> IO String
+#ifndef __GLASGOW_HASKELL__
peekCStringLen = peekCAStringLen
+#else
+peekCStringLen = GHC.peekCStringLen foreignEncoding
+#endif
-- | Marshal a Haskell string into a NUL terminated C string.
--
-- 'Foreign.Marshal.Alloc.finalizerFree'.
--
newCString :: String -> IO CString
+#ifndef __GLASGOW_HASKELL__
newCString = newCAString
+#else
+newCString = GHC.newCString foreignEncoding
+#endif
-- | Marshal a Haskell string into a C string (ie, character array) with
-- explicit length information.
-- 'Foreign.Marshal.Alloc.finalizerFree'.
--
newCStringLen :: String -> IO CStringLen
+#ifndef __GLASGOW_HASKELL__
newCStringLen = newCAStringLen
+#else
+newCStringLen = GHC.newCStringLen foreignEncoding
+#endif
-- | Marshal a Haskell string into a NUL terminated C string using temporary
-- storage.
-- storage must /not/ be used after this.
--
withCString :: String -> (CString -> IO a) -> IO a
+#ifndef __GLASGOW_HASKELL__
withCString = withCAString
+#else
+withCString = GHC.withCString foreignEncoding
+#endif
-- | Marshal a Haskell string into a C string (ie, character array)
-- in temporary storage, with explicit length information.
-- storage must /not/ be used after this.
--
withCStringLen :: String -> (CStringLen -> IO a) -> IO a
+#ifndef __GLASGOW_HASKELL__
withCStringLen = withCAStringLen
+#else
+withCStringLen = GHC.withCStringLen foreignEncoding
+#endif
+
+#ifndef __GLASGOW_HASKELL__
-- | Determines whether a character can be accurately encoded in a 'CString'.
-- Unrepresentable characters are converted to @\'?\'@.
--
-- Currently only Latin-1 characters are representable.
charIsRepresentable :: Char -> IO Bool
charIsRepresentable c = return (ord c < 256)
+#else
+-- -- | Determines whether a character can be accurately encoded in a 'CString'.
+-- -- Unrepresentable characters are converted to '?' or their nearest visual equivalent.
+charIsRepresentable :: Char -> IO Bool
+charIsRepresentable = GHC.charIsRepresentable foreignEncoding
+#endif
-- single byte characters
-- ----------------------
import Data.Bits (shiftR)
import Data.Maybe (Maybe(..))
import Data.Typeable
-import Foreign.C.Error (throwErrno)
import GHC.Base
import GHC.Conc.Sync
import GHC.Enum (Enum)
import GHC.Real (div, fromIntegral)
import GHC.Show (Show)
import GHC.Word (Word32, Word64)
+import GHC.Windows
-- ----------------------------------------------------------------------------
-- Thread waiting
r <- c_WaitForSingleObject wakeup timeout
case r of
- 0xffffffff -> do c_maperrno; throwErrno "service_loop"
+ 0xffffffff -> do throwGetLastError "service_loop"
0 -> do
r2 <- c_readIOManagerEvent
exit <-
milli_seconds = (micro_seconds + 999) `div` 1000
in return (all, fromIntegral milli_seconds)
--- ToDo: this just duplicates part of System.Win32.Types, which isn't
--- available yet. We should move some Win32 functionality down here,
--- maybe as part of the grand reorganisation of the base package...
-type HANDLE = Ptr ()
-type DWORD = Word32
-
-iNFINITE :: DWORD
-iNFINITE = 0xFFFFFFFF -- urgh
-
foreign import ccall unsafe "getIOManagerEvent" -- in the RTS (ThrIOManager.c)
c_getIOManagerEvent :: IO HANDLE
foreign import ccall unsafe "sendIOManagerEvent" -- in the RTS (ThrIOManager.c)
c_sendIOManagerEvent :: Word32 -> IO ()
-foreign import ccall unsafe "maperrno" -- in Win32Utils.c
- c_maperrno :: IO ()
-
foreign import stdcall "WaitForSingleObject"
c_WaitForSingleObject :: HANDLE -> DWORD -> IO DWORD
-{-# LANGUAGE ForeignFunctionInterface #-}
+{-# LANGUAGE CPP, ForeignFunctionInterface #-}
module GHC.Environment (getFullArgs) where
import Prelude
import Foreign
import Foreign.C
+
+#ifdef mingw32_HOST_OS
+import GHC.IO (finally)
+import GHC.Windows
+
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+getFullArgs :: IO [String]
+getFullArgs = do
+ p_arg_string <- c_GetCommandLine
+ alloca $ \p_argc -> do
+ p_argv <- c_CommandLineToArgv p_arg_string p_argc
+ if p_argv == nullPtr
+ then throwGetLastError "getFullArgs"
+ else flip finally (c_LocalFree p_argv) $ do
+ argc <- peek p_argc
+ p_argvs <- peekArray (fromIntegral argc) p_argv
+ mapM peekCWString p_argvs
+
+foreign import stdcall unsafe "windows.h GetCommandLineW"
+ c_GetCommandLine :: IO (Ptr CWString)
+
+foreign import stdcall unsafe "windows.h CommandLineToArgvW"
+ c_CommandLineToArgv :: Ptr CWString -> Ptr CInt -> IO (Ptr CWString)
+
+foreign import stdcall unsafe "Windows.h LocalFree"
+ c_LocalFree :: Ptr a -> IO (Ptr a)
+#else
import Control.Monad
+import GHC.IO.Encoding
+import qualified GHC.Foreign as GHC
+
getFullArgs :: IO [String]
getFullArgs =
alloca $ \ p_argc ->
getFullProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
- peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString
+ peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString fileSystemEncoding)
foreign import ccall unsafe "getFullProgArgv"
getFullProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
-
+#endif
\ No newline at end of file
--- /dev/null
+{-# LANGUAGE CPP, NoImplicitPrelude #-}
+
+-----------------------------------------------------------------------------
+-- |
+-- Module : GHC.IO.Encoding
+-- Copyright : (c) The University of Glasgow, 2008-2011
+-- License : see libraries/base/LICENSE
+--
+-- Maintainer : libraries@haskell.org
+-- Stability : internal
+-- Portability : non-portable
+--
+-- Foreign marshalling support for CStrings with configurable encodings
+--
+-----------------------------------------------------------------------------
+
+module GHC.Foreign (
+ -- * C strings with a configurable encoding
+
+ -- conversion of C strings into Haskell strings
+ --
+ peekCString, -- :: TextEncoding -> CString -> IO String
+ peekCStringLen, -- :: TextEncoding -> CStringLen -> IO String
+
+ -- conversion of Haskell strings into C strings
+ --
+ newCString, -- :: TextEncoding -> String -> IO CString
+ newCStringLen, -- :: TextEncoding -> String -> IO CStringLen
+
+ -- conversion of Haskell strings into C strings using temporary storage
+ --
+ withCString, -- :: TextEncoding -> String -> (CString -> IO a) -> IO a
+ withCStringLen, -- :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+
+ charIsRepresentable, -- :: TextEncoding -> Char -> IO Bool
+ ) where
+
+import Foreign.Marshal.Array
+import Foreign.C.Types
+import Foreign.Ptr
+import Foreign.Storable
+
+import Data.Word
+
+-- Imports for the locale-encoding version of marshallers
+import Control.Monad
+
+import Data.Tuple (fst)
+import Data.Maybe
+
+import {-# SOURCE #-} System.Posix.Internals (puts)
+import GHC.Show ( show )
+
+import Foreign.Marshal.Alloc
+import Foreign.ForeignPtr
+
+import GHC.Err (undefined)
+import GHC.List
+import GHC.Num
+import GHC.Base
+
+import GHC.IO
+import GHC.IO.Exception
+import GHC.IO.Buffer
+import GHC.IO.Encoding.Types
+
+
+c_DEBUG_DUMP :: Bool
+c_DEBUG_DUMP = False
+
+putDebugMsg :: String -> IO ()
+putDebugMsg | c_DEBUG_DUMP = puts
+ | otherwise = const (return ())
+
+
+-- These definitions are identical to those in Foreign.C.String, but copied in here to avoid a cycle:
+type CString = Ptr CChar
+type CStringLen = (Ptr CChar, Int)
+
+-- exported functions
+-- ------------------
+
+-- | Marshal a NUL terminated C string into a Haskell string.
+--
+peekCString :: TextEncoding -> CString -> IO String
+peekCString enc cp = do
+ sz <- lengthArray0 nUL cp
+ peekEncodedCString enc (cp, sz * cCharSize)
+
+-- | Marshal a C string with explicit length into a Haskell string.
+--
+peekCStringLen :: TextEncoding -> CStringLen -> IO String
+peekCStringLen = peekEncodedCString
+
+-- | Marshal a Haskell string into a NUL terminated C string.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * new storage is allocated for the C string and must be
+-- explicitly freed using 'Foreign.Marshal.Alloc.free' or
+-- 'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCString :: TextEncoding -> String -> IO CString
+newCString enc = liftM fst . newEncodedCString enc True
+
+-- | Marshal a Haskell string into a C string (ie, character array) with
+-- explicit length information.
+--
+-- * new storage is allocated for the C string and must be
+-- explicitly freed using 'Foreign.Marshal.Alloc.free' or
+-- 'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCStringLen :: TextEncoding -> String -> IO CStringLen
+newCStringLen enc = newEncodedCString enc False
+
+-- | Marshal a Haskell string into a NUL terminated C string using temporary
+-- storage.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * the memory is freed when the subcomputation terminates (either
+-- normally or via an exception), so the pointer to the temporary
+-- storage must /not/ be used after this.
+--
+withCString :: TextEncoding -> String -> (CString -> IO a) -> IO a
+withCString enc s act = withEncodedCString enc True s $ \(cp, _sz) -> act cp
+
+-- | Marshal a Haskell string into a C string (ie, character array)
+-- in temporary storage, with explicit length information.
+--
+-- * the memory is freed when the subcomputation terminates (either
+-- normally or via an exception), so the pointer to the temporary
+-- storage must /not/ be used after this.
+--
+withCStringLen :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+withCStringLen enc = withEncodedCString enc False
+
+
+-- | Determines whether a character can be accurately encoded in a 'CString'.
+--
+-- Pretty much anyone who uses this function is in a state of sin because
+-- whether or not a character is encodable will, in general, depend on the
+-- context in which it occurs.
+charIsRepresentable :: TextEncoding -> Char -> IO Bool
+charIsRepresentable enc c = withCString enc [c] (fmap (== [c]) . peekCString enc) `catchException` (\e -> let _ = e :: IOException in return False)
+
+-- auxiliary definitions
+-- ----------------------
+
+-- C's end of string character
+nUL :: CChar
+nUL = 0
+
+-- Size of a CChar in bytes
+cCharSize :: Int
+cCharSize = sizeOf (undefined :: CChar)
+
+
+{-# INLINE peekEncodedCString #-}
+peekEncodedCString :: TextEncoding -- ^ Encoding of CString
+ -> CStringLen
+ -> IO String -- ^ String in Haskell terms
+peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
+ = bracket mk_decoder close $ \decoder -> do
+ let chunk_size = sz_bytes `max` 1 -- Decode buffer chunk size in characters: one iteration only for ASCII
+ from0 <- fmap (\fp -> bufferAdd sz_bytes (emptyBuffer fp sz_bytes ReadBuffer)) $ newForeignPtr_ (castPtr p)
+ to <- newCharBuffer chunk_size WriteBuffer
+
+ let go iteration from = do
+ (why, from', to') <- encode decoder from to
+ if isEmptyBuffer from'
+ then
+ -- No input remaining: @why@ will be InputUnderflow, but we don't care
+ withBuffer to' $ peekArray (bufferElems to')
+ else do
+ -- Input remaining: what went wrong?
+ putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
+ (from'', to'') <- case why of InvalidSequence -> recover decoder from' to' -- These conditions are equally bad because
+ InputUnderflow -> recover decoder from' to' -- they indicate malformed/truncated input
+ OutputUnderflow -> return (from', to') -- We will have more space next time round
+ putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
+ putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
+ to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
+ fmap (to_chars++) $ go (iteration + 1) from''
+
+ go (0 :: Int) from0
+
+{-# INLINE withEncodedCString #-}
+withEncodedCString :: TextEncoding -- ^ Encoding of CString to create
+ -> Bool -- ^ Null-terminate?
+ -> String -- ^ String to encode
+ -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
+ -> IO a
+withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
+ = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+ from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+ let go iteration to_sz_bytes = do
+ putDebugMsg ("withEncodedCString: " ++ show iteration)
+ allocaBytes to_sz_bytes $ \to_p -> do
+ mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes act
+ case mb_res of
+ Nothing -> go (iteration + 1) (to_sz_bytes * 2)
+ Just res -> return res
+
+ -- If the input string is ASCII, this value will ensure we only allocate once
+ go (0 :: Int) (cCharSize * (sz + 1))
+
+{-# INLINE newEncodedCString #-}
+newEncodedCString :: TextEncoding -- ^ Encoding of CString to create
+ -> Bool -- ^ Null-terminate?
+ -> String -- ^ String to encode
+ -> IO CStringLen
+newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
+ = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+ from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+ let go iteration to_p to_sz_bytes = do
+ putDebugMsg ("newEncodedCString: " ++ show iteration)
+ mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes return
+ case mb_res of
+ Nothing -> do
+ let to_sz_bytes' = to_sz_bytes * 2
+ to_p' <- reallocBytes to_p to_sz_bytes'
+ go (iteration + 1) to_p' to_sz_bytes'
+ Just res -> return res
+
+ -- If the input string is ASCII, this value will ensure we only allocate once
+ let to_sz_bytes = cCharSize * (sz + 1)
+ to_p <- mallocBytes to_sz_bytes
+ go (0 :: Int) to_p to_sz_bytes
+
+
+tryFillBufferAndCall :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int
+ -> (CStringLen -> IO a) -> IO (Maybe a)
+tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do
+ to_fp <- newForeignPtr_ to_p
+ go (0 :: Int) (from0, emptyBuffer to_fp to_sz_bytes WriteBuffer)
+ where
+ go iteration (from, to) = do
+ (why, from', to') <- encode encoder from to
+ putDebugMsg ("tryFillBufferAndCall: " ++ show iteration ++ " " ++ show why ++ " " ++ summaryBuffer from ++ " " ++ summaryBuffer from')
+ if isEmptyBuffer from'
+ then if null_terminate && bufferAvailable to' == 0
+ then return Nothing -- We had enough for the string but not the terminator: ask the caller for more buffer
+ else do
+ -- Awesome, we had enough buffer
+ let bytes = bufferElems to'
+ withBuffer to' $ \to_ptr -> do
+ when null_terminate $ pokeElemOff to_ptr (bufR to') 0
+ fmap Just $ act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes*
+ else case why of -- We didn't consume all of the input
+ InputUnderflow -> recover encoder from' to' >>= go (iteration + 1) -- These conditions are equally bad
+ InvalidSequence -> recover encoder from' to' >>= go (iteration + 1) -- since the input was truncated/invalid
+ OutputUnderflow -> return Nothing -- Oops, out of buffer during decoding: ask the caller for more
mask, mask_, uninterruptibleMask, uninterruptibleMask_,
MaskingState(..), getMaskingState,
block, unblock, blocked, unsafeUnmask,
- onException, finally, evaluate
+ onException, bracket, finally, evaluate
) where
import GHC.Base
MaskedInterruptible -> blockUninterruptible $ io block
MaskedUninterruptible -> io id
+bracket
+ :: IO a -- ^ computation to run first (\"acquire resource\")
+ -> (a -> IO b) -- ^ computation to run last (\"release resource\")
+ -> (a -> IO c) -- ^ computation to run in-between
+ -> IO c -- returns the value from the in-between computation
+bracket before after thing =
+ mask $ \restore -> do
+ a <- before
+ r <- restore (thing a) `onException` after a
+ _ <- after a
+ return r
+
finally :: IO a -- ^ computation to run first
-> IO b -- ^ computation to run afterward (even if an exception
-- was raised)
-----------------------------------------------------------------------------
module GHC.IO.Encoding (
- BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
+ BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
latin1, latin1_encode, latin1_decode,
utf8, utf8_bom,
utf16, utf16le, utf16be,
utf32, utf32le, utf32be,
- localeEncoding,
+ localeEncoding, fileSystemEncoding, foreignEncoding,
mkTextEncoding,
) where
import GHC.Base
--import GHC.IO
+import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
#if !defined(mingw32_HOST_OS)
import qualified GHC.IO.Encoding.UTF16 as UTF16
import qualified GHC.IO.Encoding.UTF32 as UTF32
-#if defined(mingw32_HOST_OS)
+import Data.List
import Data.Maybe
-import GHC.IO.Exception
-#endif
-- -----------------------------------------------------------------------------
utf32be = UTF32.utf32be
-- | The Unicode encoding of the current locale
-localeEncoding :: TextEncoding
+localeEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but allowing arbitrary
+-- undecodable bytes to be round-tripped through it.
+--
+-- This 'TextEncoding' is used to decode and encode command line arguments
+-- and environment variables on non-Windows platforms.
+--
+-- On Windows, this encoding *should not* be used if possible because
+-- the use of code pages is deprecated: Strings should be retrieved
+-- via the "wide" W-family of UTF-16 APIs instead
+fileSystemEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but where undecodable
+-- bytes are replaced with their closest visual match. Used for
+-- the 'CString' marshalling functions in "Foreign.C.String"
+foreignEncoding :: TextEncoding
+
#if !defined(mingw32_HOST_OS)
localeEncoding = Iconv.localeEncoding
+fileSystemEncoding = Iconv.mkLocaleEncoding SurrogateEscapeFailure
+foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
#else
localeEncoding = CodePage.localeEncoding
+fileSystemEncoding = CodePage.mkLocaleEncoding SurrogateEscapeFailure
+foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif
-- | Look up the named Unicode encoding. May fail with
-- @CP@; for example, @\"CP1250\"@.
--
mkTextEncoding :: String -> IO TextEncoding
-#if !defined(mingw32_HOST_OS)
-mkTextEncoding = Iconv.mkTextEncoding
+mkTextEncoding e = case mb_coding_failure_mode of
+ Nothing -> unknown_encoding
+ Just cfm -> case enc of
+ "UTF-8" -> return $ UTF8.mkUTF8 cfm
+ "UTF-16" -> return $ UTF16.mkUTF16 cfm
+ "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
+ "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
+ "UTF-32" -> return $ UTF32.mkUTF32 cfm
+ "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
+ "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
+#if defined(mingw32_HOST_OS)
+ 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
+ _ -> unknown_encoding
#else
-mkTextEncoding "UTF-8" = return utf8
-mkTextEncoding "UTF-16" = return utf16
-mkTextEncoding "UTF-16LE" = return utf16le
-mkTextEncoding "UTF-16BE" = return utf16be
-mkTextEncoding "UTF-32" = return utf32
-mkTextEncoding "UTF-32LE" = return utf32le
-mkTextEncoding "UTF-32BE" = return utf32be
-mkTextEncoding ('C':'P':n)
- | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
-mkTextEncoding e = ioException
- (IOError Nothing NoSuchThing "mkTextEncoding"
- ("unknown encoding:" ++ e) Nothing Nothing)
+ _ -> Iconv.mkIconvEncoding cfm enc
#endif
+ where
+ -- The only problem with actually documenting //IGNORE and //TRANSLIT as
+ -- supported suffixes is that they are not necessarily supported with non-GNU iconv
+ (enc, suffix) = span (/= '/') e
+ mb_coding_failure_mode = case suffix of
+ "" -> Just ErrorOnCodingFailure
+ "//IGNORE" -> Just IgnoreCodingFailure
+ "//TRANSLIT" -> Just TransliterateCodingFailure
+ "//SURROGATE" -> Just SurrogateEscapeFailure
+ _ -> Nothing
+
+ unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
+ ("unknown encoding:" ++ e) Nothing Nothing)
latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
-latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
+latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for binary
--latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
-latin1_decode = Latin1.latin1_decode
+latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
--latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude #-}
+module GHC.IO.Encoding where
+
+import GHC.IO.Encoding.Types
+
+localeEncoding, fileSystemEncoding, foreignEncoding :: TextEncoding
\ No newline at end of file
#if !defined(mingw32_HOST_OS)
) where
#else
- codePageEncoding,
- localeEncoding
+ codePageEncoding, mkCodePageEncoding,
+ localeEncoding, mkLocaleEncoding
) where
import GHC.Base
import GHC.Enum
import GHC.Word
import GHC.IO (unsafePerformIO)
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.IO.Buffer
-import GHC.IO.Exception
import Data.Bits
import Data.Maybe
import Data.List (lookup)
import GHC.IO.Encoding.CodePage.Table
-import GHC.IO.Encoding.Latin1 (latin1)
-import GHC.IO.Encoding.UTF8 (utf8)
-import GHC.IO.Encoding.UTF16 (utf16le, utf16be)
-import GHC.IO.Encoding.UTF32 (utf32le, utf32be)
+import GHC.IO.Encoding.Latin1 (mkLatin1)
+import GHC.IO.Encoding.UTF8 (mkUTF8)
+import GHC.IO.Encoding.UTF16 (mkUTF16le, mkUTF16be)
+import GHC.IO.Encoding.UTF32 (mkUTF32le, mkUTF32be)
-- note CodePage = UInt which might not work on Win64. But the Win32 package
-- also has this issue.
foreign import stdcall unsafe "windows.h GetACP"
getACP :: IO Word32
-{-# NOINLINE localeEncoding #-}
+{-# NOINLINE currentCodePage #-}
+currentCodePage :: Word32
+currentCodePage = unsafePerformIO getCurrentCodePage
+
localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ fmap codePageEncoding getCurrentCodePage
-
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = mkCodePageEncoding cfm currentCodePage
+
codePageEncoding :: Word32 -> TextEncoding
-codePageEncoding 65001 = utf8
-codePageEncoding 1200 = utf16le
-codePageEncoding 1201 = utf16be
-codePageEncoding 12000 = utf32le
-codePageEncoding 12001 = utf32be
-codePageEncoding cp = maybe latin1 (buildEncoding cp) (lookup cp codePageMap)
-
-buildEncoding :: Word32 -> CodePageArrays -> TextEncoding
-buildEncoding cp SingleByteCP {decoderArray = dec, encoderArray = enc}
+codePageEncoding = mkCodePageEncoding ErrorOnCodingFailure
+
+mkCodePageEncoding :: CodingFailureMode -> Word32 -> TextEncoding
+mkCodePageEncoding cfm 65001 = mkUTF8 cfm
+mkCodePageEncoding cfm 1200 = mkUTF16le cfm
+mkCodePageEncoding cfm 1201 = mkUTF16be cfm
+mkCodePageEncoding cfm 12000 = mkUTF32le cfm
+mkCodePageEncoding cfm 12001 = mkUTF32be cfm
+mkCodePageEncoding cfm cp = maybe (mkLatin1 cfm) (buildEncoding cfm cp) (lookup cp codePageMap)
+
+buildEncoding :: CodingFailureMode -> Word32 -> CodePageArrays -> TextEncoding
+buildEncoding cfm cp SingleByteCP {decoderArray = dec, encoderArray = enc}
= TextEncoding {
- textEncodingName = "CP" ++ show cp,
- mkTextDecoder = return $ simpleCodec
- $ decodeFromSingleByte dec
- , mkTextEncoder = return $ simpleCodec $ encodeToSingleByte enc
+ textEncodingName = "CP" ++ show cp
+ , mkTextDecoder = return $ simpleCodec (recoverDecode cfm) $ decodeFromSingleByte dec
+ , mkTextEncoder = return $ simpleCodec (recoverEncode cfm) $ encodeToSingleByte enc
}
simpleCodec :: (Buffer from -> Buffer to -> IO (Buffer from, Buffer to))
+ -> (Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to))
-> BufferCodec from to ()
-simpleCodec f = BufferCodec {encode = f, close = return (), getState = return (),
- setState = return }
+simpleCodec r f = BufferCodec {
+ encode = f,
+ recover = r,
+ close = return (),
+ getState = return (),
+ setState = return
+ }
decodeFromSingleByte :: ConvArray Char -> DecodeBuffer
decodeFromSingleByte convArr
input@Buffer { bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer { bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir==iw then input{ bufL=0, bufR=0}
- else input{ bufL=ir},
- output {bufR=ow})
+ done why !ir !ow = return (why,
+ if ir==iw then input{ bufL=0, bufR=0}
+ else input{ bufL=ir},
+ output {bufR=ow})
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
b <- readWord8Buf iraw ir
let c = lookupConv convArr b
ow' <- writeCharBuf oraw ow c
loop (ir+1) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
in loop ir0 ow0
encodeToSingleByte :: CompactArray Char Word8 -> EncodeBuffer
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir==iw then input { bufL=0, bufR=0 }
- else input { bufL=ir },
- output {bufR=ow})
+ done why !ir !ow = return (why,
+ if ir==iw then input { bufL=0, bufR=0 }
+ else input { bufL=ir },
+ output {bufR=ow})
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case lookupCompact maxChar indices values c of
writeWord8Buf oraw ow b
loop ir' (ow+1)
where
- invalid = if ir > ir0 then done ir ow else ioe_encodingError
+ invalid = done InvalidSequence ir ow
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "codePageEncoding"
- "invalid code page byte sequence" Nothing Nothing)
-
-ioe_encodingError :: IO a
-ioe_encodingError = ioException
- (IOError Nothing InvalidArgument "codePageEncoding"
- "character is not in the code page" Nothing Nothing)
-
--------------------------------------------
-- Array access functions
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module : GHC.IO.Encoding.Failure
+-- Copyright : (c) The University of Glasgow, 2008-2011
+-- License : see libraries/base/LICENSE
+--
+-- Maintainer : libraries@haskell.org
+-- Stability : internal
+-- Portability : non-portable
+--
+-- Types for specifying how text encoding/decoding fails
+--
+-----------------------------------------------------------------------------
+
+module GHC.IO.Encoding.Failure (
+ CodingFailureMode(..), codingFailureModeSuffix,
+ isSurrogate, recoverDecode, recoverEncode
+ ) where
+
+import GHC.IO
+import GHC.IO.Buffer
+import GHC.IO.Exception
+
+import GHC.Base
+import GHC.Word
+import GHC.Show
+import GHC.Num
+import GHC.Real ( fromIntegral )
+
+--import System.Posix.Internals
+
+import Data.Maybe
+
+-- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
+-- how they handle illegal sequences.
+data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered
+ | IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered
+ | TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence
+ | SurrogateEscapeFailure -- ^ Use the surrogate escape mechanism to attempt to allow illegal sequences to be roundtripped.
+ deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
+ -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
+ -- ASCII characters must be padded to two bytes to retain their meaning.
+
+codingFailureModeSuffix :: CodingFailureMode -> String
+codingFailureModeSuffix ErrorOnCodingFailure = ""
+codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
+codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
+codingFailureModeSuffix SurrogateEscapeFailure = "//SURROGATE"
+
+-- | In transliterate mode, we use this character when decoding unknown bytes.
+--
+-- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
+unrepresentableChar :: Char
+unrepresentableChar = '\xFFFD'
+
+-- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
+-- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because the
+-- 'SurrogateEscapeFailure' mode creates unpaired surrogates to round-trip bytes through our internal
+-- UTF-16 encoding.
+isSurrogate :: Char -> Bool
+isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
+ where x = ord c
+
+escapeToSurrogateCharacter :: Word8 -> Char
+escapeToSurrogateCharacter b
+ | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
+ | otherwise = chr (0xDC00 + fromIntegral b)
+
+unescapeSurrogateCharacter :: Char -> Maybe Word8
+unescapeSurrogateCharacter c
+ | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
+ | otherwise = Nothing
+ where x = ord c
+
+recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
+recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
+ output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
+ --puts $ "recoverDecode " ++ show ir
+ case cfm of
+ ErrorOnCodingFailure -> ioe_decodingError
+ IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
+ TransliterateCodingFailure -> do
+ ow' <- writeCharBuf oraw ow unrepresentableChar
+ return (input { bufL=ir+1 }, output { bufR=ow' })
+ SurrogateEscapeFailure -> do
+ b <- readWord8Buf iraw ir
+ ow' <- writeCharBuf oraw ow (escapeToSurrogateCharacter b)
+ return (input { bufL=ir+1 }, output { bufR=ow' })
+
+recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
+recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
+ output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
+ (c,ir') <- readCharBuf iraw ir
+ --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
+ case cfm of
+ IgnoreCodingFailure -> return (input { bufL=ir' }, output)
+ TransliterateCodingFailure -> do
+ if c == '?'
+ then return (input { bufL=ir' }, output)
+ else do
+ -- XXX: evil hack! To implement transliteration, we just poke an
+ -- ASCII ? into the input buffer and tell the caller to try and decode
+ -- again. This is *probably* safe given current uses of TextEncoding.
+ --
+ -- The "if" test above ensures we skip if the encoding fails to deal with
+ -- the ?, though this should never happen in practice as all encodings are
+ -- in fact capable of reperesenting all ASCII characters.
+ _ir' <- writeCharBuf iraw ir '?'
+ return (input, output)
+
+ -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
+ -- encode a simple ASCII value
+ --writeWord8Buf oraw ow unrepresentableByte
+ --return (input { bufL=ir' }, output { bufR=ow+1 })
+ SurrogateEscapeFailure | Just x <- unescapeSurrogateCharacter c -> do
+ writeWord8Buf oraw ow x
+ return (input { bufL=ir' }, output { bufR=ow+1 })
+ _ -> ioe_encodingError
+
+ioe_decodingError :: IO a
+ioe_decodingError = ioException
+ (IOError Nothing InvalidArgument "recoverDecode"
+ "invalid byte sequence" Nothing Nothing)
+
+ioe_encodingError :: IO a
+ioe_encodingError = ioException
+ (IOError Nothing InvalidArgument "recoverEncode"
+ "invalid character" Nothing Nothing)
-- #hide
module GHC.IO.Encoding.Iconv (
#if !defined(mingw32_HOST_OS)
- mkTextEncoding,
- latin1,
- utf8,
- utf16, utf16le, utf16be,
- utf32, utf32le, utf32be,
- localeEncoding
+ iconvEncoding, mkIconvEncoding,
+ localeEncoding, mkLocaleEncoding
#endif
) where
import Data.Maybe
import GHC.Base
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.List (span)
import GHC.Num
| c_DEBUG_DUMP = puts s
| otherwise = return ()
-puts :: String -> IO ()
-puts s = do _ <- withCAStringLen (s ++ "\n") $ \(p, len) ->
- -- In reality should be withCString, but assume ASCII to avoid loop
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
-
-- -----------------------------------------------------------------------------
-- iconv encoders/decoders
-{-# NOINLINE latin1 #-}
-latin1 :: TextEncoding
-latin1 = unsafePerformIO (mkTextEncoding "Latin1")
-
-{-# NOINLINE utf8 #-}
-utf8 :: TextEncoding
-utf8 = unsafePerformIO (mkTextEncoding "UTF8")
-
-{-# NOINLINE utf16 #-}
-utf16 :: TextEncoding
-utf16 = unsafePerformIO (mkTextEncoding "UTF16")
-
-{-# NOINLINE utf16le #-}
-utf16le :: TextEncoding
-utf16le = unsafePerformIO (mkTextEncoding "UTF16LE")
-
-{-# NOINLINE utf16be #-}
-utf16be :: TextEncoding
-utf16be = unsafePerformIO (mkTextEncoding "UTF16BE")
-
-{-# NOINLINE utf32 #-}
-utf32 :: TextEncoding
-utf32 = unsafePerformIO (mkTextEncoding "UTF32")
-
-{-# NOINLINE utf32le #-}
-utf32le :: TextEncoding
-utf32le = unsafePerformIO (mkTextEncoding "UTF32LE")
-
-{-# NOINLINE utf32be #-}
-utf32be :: TextEncoding
-utf32be = unsafePerformIO (mkTextEncoding "UTF32BE")
-
{-# NOINLINE localeEncodingName #-}
localeEncodingName :: String
localeEncodingName = unsafePerformIO $ do
cstr <- c_localeEncoding
peekCAString cstr -- Assume charset names are ASCII
-{-# NOINLINE localeEncoding #-}
localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ mkTextEncoding localeEncodingName
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName
-- We hope iconv_t is a storable type. It should be, since it has at least the
-- value -1, which is a possible return value from iconv_open.
char_shift | charSize == 2 = 1
| otherwise = 2
-mkTextEncoding :: String -> IO TextEncoding
-mkTextEncoding charset = do
+iconvEncoding :: String -> IO TextEncoding
+iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
+
+mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
+mkIconvEncoding cfm charset = do
return (TextEncoding {
textEncodingName = charset,
- mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) iconvDecode,
- mkTextEncoder = newIConv haskellChar charset iconvEncode})
+ mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
+ mkTextEncoder = newIConv haskellChar charset (recoverEncode cfm) iconvEncode})
where
-- An annoying feature of GNU iconv is that the //PREFIXES only take
-- effect when they appear on the tocode parameter to iconv_open:
(raw_charset, suffix) = span (/= '/') charset
newIConv :: String -> String
- -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+ -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+ -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
-> IO (BufferCodec a b ())
-newIConv from to fn =
+newIConv from to rec fn =
-- Assume charset names are ASCII
withCAString from $ \ from_str ->
withCAString to $ \ to_str -> do
let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
return BufferCodec{
encode = fn iconvt,
+ recover = rec,
close = iclose,
-- iconv doesn't supply a way to save/restore the state
getState = return (),
setState = const $ return ()
}
-iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
- -> IO (Buffer Word8, Buffer CharBufElem)
+iconvDecode :: IConv -> DecodeBuffer
iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
-iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8
- -> IO (Buffer CharBufElem, Buffer Word8)
+iconvEncode :: IConv -> EncodeBuffer
iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
- -> IO (Buffer a, Buffer b)
+ -> IO (CodingProgress, Buffer a, Buffer b)
iconvRecode iconv_t
input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
if (res /= -1)
then do -- all input translated
- return (new_input, new_output)
+ return (InputUnderflow, new_input, new_output)
else do
errno <- getErrno
case errno of
- e | e == eINVAL || e == e2BIG
- || e == eILSEQ && new_inleft' /= (iw-ir) -> do
- iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
- -- Output overflow is harmless
- --
- -- Similarly, we ignore EILSEQ unless we converted no
- -- characters. Sometimes iconv reports EILSEQ for a
- -- character in the input even when there is no room
- -- in the output; in this case we might be about to
- -- change the encoding anyway, so the following bytes
- -- could very well be in a different encoding.
- -- This also helps with pinpointing EILSEQ errors: we
- -- don't report it until the rest of the characters in
- -- the buffer have been drained.
- return (new_input, new_output)
-
- e -> do
- iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
- throwErrno "iconvRecoder"
- -- illegal sequence, or some other error
+ e | e == e2BIG -> return (OutputUnderflow, new_input, new_output)
+ | e == eINVAL -> return (InputUnderflow, new_input, new_output)
+ -- Sometimes iconv reports EILSEQ for a
+ -- character in the input even when there is no room
+ -- in the output; in this case we might be about to
+ -- change the encoding anyway, so the following bytes
+ -- could very well be in a different encoding.
+ --
+ -- Because we can only say InvalidSequence if there is at least
+ -- one element left in the output, we have to special case this.
+ | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
+ | otherwise -> do
+ iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
+ throwErrno "iconvRecoder"
#endif /* !mingw32_HOST_OS */
-----------------------------------------------------------------------------
module GHC.IO.Encoding.Latin1 (
- latin1,
- latin1_checked,
+ latin1, mkLatin1,
+ latin1_checked, mkLatin1_checked,
latin1_decode,
latin1_encode,
latin1_checked_encode,
import GHC.Real
import GHC.Num
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
-import Data.Maybe
-- -----------------------------------------------------------------------------
-- Latin1
latin1 :: TextEncoding
-latin1 = TextEncoding { textEncodingName = "ISO8859-1",
- mkTextDecoder = latin1_DF,
- mkTextEncoder = latin1_EF }
+latin1 = mkLatin1 ErrorOnCodingFailure
-latin1_DF :: IO (TextDecoder ())
-latin1_DF =
+mkLatin1 :: CodingFailureMode -> TextEncoding
+mkLatin1 cfm = TextEncoding { textEncodingName = "ISO8859-1",
+ mkTextDecoder = latin1_DF cfm,
+ mkTextEncoder = latin1_EF cfm }
+
+latin1_DF :: CodingFailureMode -> IO (TextDecoder ())
+latin1_DF cfm =
return (BufferCodec {
encode = latin1_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-latin1_EF :: IO (TextEncoder ())
-latin1_EF =
+latin1_EF :: CodingFailureMode -> IO (TextEncoder ())
+latin1_EF cfm =
return (BufferCodec {
encode = latin1_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
latin1_checked :: TextEncoding
-latin1_checked = TextEncoding { textEncodingName = "ISO8859-1(checked)",
- mkTextDecoder = latin1_DF,
- mkTextEncoder = latin1_checked_EF }
+latin1_checked = mkLatin1_checked ErrorOnCodingFailure
+
+mkLatin1_checked :: CodingFailureMode -> TextEncoding
+mkLatin1_checked cfm = TextEncoding { textEncodingName = "ISO8859-1(checked)",
+ mkTextDecoder = latin1_DF cfm,
+ mkTextEncoder = latin1_checked_EF cfm }
-latin1_checked_EF :: IO (TextEncoder ())
-latin1_checked_EF =
+latin1_checked_EF :: CodingFailureMode -> IO (TextEncoder ())
+latin1_checked_EF cfm =
return (BufferCodec {
encode = latin1_checked_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
loop (ir+1) ow'
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
writeWord8Buf oraw ow (fromIntegral (ord c))
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
if ord c > 0xff then invalid else do
writeWord8Buf oraw ow (fromIntegral (ord c))
loop ir' (ow+1)
where
- invalid = if ir > ir0 then done ir ow else ioe_encodingError
+ invalid = done InvalidSequence ir ow
in
loop ir0 ow0
-
-ioe_encodingError :: IO a
-ioe_encodingError = ioException
- (IOError Nothing InvalidArgument "latin1_checked_encode"
- "character is out of range for this encoding" Nothing Nothing)
TextEncoding(..),
TextEncoder, TextDecoder,
EncodeBuffer, DecodeBuffer,
+ CodingProgress(..)
) where
import GHC.Base
-- Text encoders/decoders
data BufferCodec from to state = BufferCodec {
- encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+ encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to),
-- ^ The @encode@ function translates elements of the buffer @from@
-- to the buffer @to@. It should translate as many elements as possible
-- given the sizes of the buffers, including translating zero elements
-- if there is either not enough room in @to@, or @from@ does not
-- contain a complete multibyte sequence.
- --
- -- @encode@ should raise an exception if, and only if, @from@
- -- begins with an illegal sequence, or the first element of @from@
- -- is not representable in the encoding of @to@. That is, if any
- -- elements can be successfully translated before an error is
- -- encountered, then @encode@ should translate as much as it can
- -- and not throw an exception. This behaviour is used by the IO
+ --
+ -- The fact that as many elements as possible are translated is used by the IO
-- library in order to report translation errors at the point they
-- actually occur, rather than when the buffer is translated.
+
+ recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+ -- ^ The @recover@ function is used to continue decoding
+ -- in the presence of invalid or unrepresentable sequences. This includes
+ -- both those detected by @encode@ returning @InvalidSequence@ and those
+ -- that occur because the input byte sequence appears to be truncated.
+ --
+ -- Progress will usually be made by skipping the first element of the @from@
+ -- buffer. This function should only be called if you are certain that you
+ -- wish to do this skipping, and if the @to@ buffer has at least one element
+ -- of free space.
--
+ -- @recover@ may raise an exception rather than skipping anything.
+
close :: IO (),
-- ^ Resources associated with the encoding may now be released.
-- The @encode@ function may not be called again after calling
-- beginning), and if not, whether to use the big or little-endian
-- encoding.
- setState :: state -> IO()
+ setState :: state -> IO ()
-- restore the state of the codec using the state from a previous
-- call to 'getState'.
}
type DecodeBuffer = Buffer Word8 -> Buffer Char
- -> IO (Buffer Word8, Buffer Char)
+ -> IO (CodingProgress, Buffer Word8, Buffer Char)
type EncodeBuffer = Buffer Char -> Buffer Word8
- -> IO (Buffer Char, Buffer Word8)
+ -> IO (CodingProgress, Buffer Char, Buffer Word8)
type TextDecoder state = BufferCodec Word8 CharBufElem state
type TextEncoder state = BufferCodec CharBufElem Word8 state
instance Show TextEncoding where
-- | Returns the value of 'textEncodingName'
show te = textEncodingName te
+
+data CodingProgress = InputUnderflow -- ^ Stopped because the input contains insufficient available elements,
+ -- or all of the input sequence has been sucessfully translated.
+ | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements
+ | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
+ -- to output at least one encoded ASCII character, but the input contains
+ -- an invalid or unrepresentable sequence
+ deriving (Eq, Show)
-----------------------------------------------------------------------------
module GHC.IO.Encoding.UTF16 (
- utf16,
+ utf16, mkUTF16,
utf16_decode,
utf16_encode,
- utf16be,
+ utf16be, mkUTF16be,
utf16be_decode,
utf16be_encode,
- utf16le,
+ utf16le, mkUTF16le,
utf16le_decode,
utf16le_encode,
) where
import GHC.Real
import GHC.Num
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
import Data.Maybe
import GHC.IORef
-#if DEBUG
-import System.Posix.Internals
-import Foreign.C
-import GHC.Show
-import GHC.Ptr
-
-puts :: String -> IO ()
- -- In reality should be withCString, but assume ASCII to avoid possible loop
-puts s = do withCAStringLen (s++"\n") $ \(p,len) ->
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
-#endif
-
-- -----------------------------------------------------------------------------
-- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
utf16 :: TextEncoding
-utf16 = TextEncoding { textEncodingName = "UTF-16",
- mkTextDecoder = utf16_DF,
- mkTextEncoder = utf16_EF }
+utf16 = mkUTF16 ErrorOnCodingFailure
+
+mkUTF16 :: CodingFailureMode -> TextEncoding
+mkUTF16 cfm = TextEncoding { textEncodingName = "UTF-16",
+ mkTextDecoder = utf16_DF cfm,
+ mkTextEncoder = utf16_EF cfm }
-utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
-utf16_DF = do
+utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
+utf16_DF cfm = do
seen_bom <- newIORef Nothing
return (BufferCodec {
encode = utf16_decode seen_bom,
+ recover = recoverDecode cfm,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
-utf16_EF :: IO (TextEncoder Bool)
-utf16_EF = do
+utf16_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf16_EF cfm = do
done_bom <- newIORef False
return (BufferCodec {
encode = utf16_encode done_bom,
+ recover = recoverEncode cfm,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
b <- readIORef done_bom
if b then utf16_native_encode input output
else if os - ow < 2
- then return (input,output)
+ then return (OutputUnderflow,input,output)
else do
writeIORef done_bom True
writeWord8Buf oraw ow bom1
case mb of
Just decode -> decode input output
Nothing ->
- if iw - ir < 2 then return (input,output) else do
+ if iw - ir < 2 then return (InputUnderflow,input,output) else do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
case () of
-- UTF16LE and UTF16BE
utf16be :: TextEncoding
-utf16be = TextEncoding { textEncodingName = "UTF-16BE",
- mkTextDecoder = utf16be_DF,
- mkTextEncoder = utf16be_EF }
+utf16be = mkUTF16be ErrorOnCodingFailure
-utf16be_DF :: IO (TextDecoder ())
-utf16be_DF =
+mkUTF16be :: CodingFailureMode -> TextEncoding
+mkUTF16be cfm = TextEncoding { textEncodingName = "UTF-16BE",
+ mkTextDecoder = utf16be_DF cfm,
+ mkTextEncoder = utf16be_EF cfm }
+
+utf16be_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf16be_DF cfm =
return (BufferCodec {
encode = utf16be_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf16be_EF :: IO (TextEncoder ())
-utf16be_EF =
+utf16be_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf16be_EF cfm =
return (BufferCodec {
encode = utf16be_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf16le :: TextEncoding
-utf16le = TextEncoding { textEncodingName = "UTF16-LE",
- mkTextDecoder = utf16le_DF,
- mkTextEncoder = utf16le_EF }
+utf16le = mkUTF16le ErrorOnCodingFailure
+
+mkUTF16le :: CodingFailureMode -> TextEncoding
+mkUTF16le cfm = TextEncoding { textEncodingName = "UTF16-LE",
+ mkTextDecoder = utf16le_DF cfm,
+ mkTextEncoder = utf16le_EF cfm }
-utf16le_DF :: IO (TextDecoder ())
-utf16le_DF =
+utf16le_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf16le_DF cfm =
return (BufferCodec {
encode = utf16le_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf16le_EF :: IO (TextEncoder ())
-utf16le_EF =
+utf16le_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf16le_EF cfm =
return (BufferCodec {
encode = utf16le_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
- | ir + 1 == iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | ir + 1 == iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
if validate1 x1
then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
loop (ir+2) ow'
- else if iw - ir < 4 then done ir ow else do
+ else if iw - ir < 4 then done InputUnderflow ir ow else do
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
ow' <- writeCharBuf oraw ow (chr2 x1 x2)
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
- | ir + 1 == iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | ir + 1 == iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
if validate1 x1
then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
loop (ir+2) ow'
- else if iw - ir < 4 then done ir ow else do
+ else if iw - ir < 4 then done InputUnderflow ir ow else do
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
ow' <- writeCharBuf oraw ow (chr2 x1 x2)
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "utf16_decode"
- "invalid UTF-16 byte sequence" Nothing Nothing)
-
utf16be_encode :: EncodeBuffer
utf16be_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 2 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 2 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
- x | x < 0x10000 -> do
+ x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
writeWord8Buf oraw (ow+1) (fromIntegral x)
loop ir' (ow+2)
| otherwise -> do
- if os - ow < 4 then done ir ow else do
+ if os - ow < 4 then done OutputUnderflow ir ow else do
let
n1 = x - 0x10000
c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 2 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 2 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
- x | x < 0x10000 -> do
+ x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
writeWord8Buf oraw ow (fromIntegral x)
writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
loop ir' (ow+2)
| otherwise ->
- if os - ow < 4 then done ir ow else do
+ if os - ow < 4 then done OutputUnderflow ir ow else do
let
n1 = x - 0x10000
c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
-----------------------------------------------------------------------------
module GHC.IO.Encoding.UTF32 (
- utf32,
+ utf32, mkUTF32,
utf32_decode,
utf32_encode,
- utf32be,
+ utf32be, mkUTF32be,
utf32be_decode,
utf32be_encode,
- utf32le,
+ utf32le, mkUTF32le,
utf32le_decode,
utf32le_encode,
) where
import GHC.Real
import GHC.Num
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
-- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
utf32 :: TextEncoding
-utf32 = TextEncoding { textEncodingName = "UTF-32",
- mkTextDecoder = utf32_DF,
- mkTextEncoder = utf32_EF }
+utf32 = mkUTF32 ErrorOnCodingFailure
-utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
-utf32_DF = do
+mkUTF32 :: CodingFailureMode -> TextEncoding
+mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32",
+ mkTextDecoder = utf32_DF cfm,
+ mkTextEncoder = utf32_EF cfm }
+
+utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
+utf32_DF cfm = do
seen_bom <- newIORef Nothing
return (BufferCodec {
encode = utf32_decode seen_bom,
+ recover = recoverDecode cfm,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
-utf32_EF :: IO (TextEncoder Bool)
-utf32_EF = do
+utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf32_EF cfm = do
done_bom <- newIORef False
return (BufferCodec {
encode = utf32_encode done_bom,
+ recover = recoverEncode cfm,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
b <- readIORef done_bom
if b then utf32_native_encode input output
else if os - ow < 4
- then return (input,output)
+ then return (OutputUnderflow, input,output)
else do
writeIORef done_bom True
writeWord8Buf oraw ow bom0
case mb of
Just decode -> decode input output
Nothing ->
- if iw - ir < 4 then return (input,output) else do
+ if iw - ir < 4 then return (InputUnderflow, input,output) else do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
-- UTF32LE and UTF32BE
utf32be :: TextEncoding
-utf32be = TextEncoding { textEncodingName = "UTF-32BE",
- mkTextDecoder = utf32be_DF,
- mkTextEncoder = utf32be_EF }
+utf32be = mkUTF32be ErrorOnCodingFailure
+
+mkUTF32be :: CodingFailureMode -> TextEncoding
+mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE",
+ mkTextDecoder = utf32be_DF cfm,
+ mkTextEncoder = utf32be_EF cfm }
-utf32be_DF :: IO (TextDecoder ())
-utf32be_DF =
+utf32be_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf32be_DF cfm =
return (BufferCodec {
encode = utf32be_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf32be_EF :: IO (TextEncoder ())
-utf32be_EF =
+utf32be_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf32be_EF cfm =
return (BufferCodec {
encode = utf32be_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
utf32le :: TextEncoding
-utf32le = TextEncoding { textEncodingName = "UTF-32LE",
- mkTextDecoder = utf32le_DF,
- mkTextEncoder = utf32le_EF }
+utf32le = mkUTF32le ErrorOnCodingFailure
-utf32le_DF :: IO (TextDecoder ())
-utf32le_DF =
+mkUTF32le :: CodingFailureMode -> TextEncoding
+mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE",
+ mkTextDecoder = utf32le_DF cfm,
+ mkTextEncoder = utf32le_EF cfm }
+
+utf32le_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf32le_DF cfm =
return (BufferCodec {
encode = utf32le_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf32le_EF :: IO (TextEncoder ())
-utf32le_EF =
+utf32le_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf32le_EF cfm =
return (BufferCodec {
encode = utf32le_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || iw - ir < 4 = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | iw - ir < 4 = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
ow' <- writeCharBuf oraw ow x1
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || iw - ir < 4 = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | iw - ir < 4 = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
ow' <- writeCharBuf oraw ow x1
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "utf32_decode"
- "invalid UTF-32 byte sequence" Nothing Nothing)
-
utf32be_encode :: EncodeBuffer
utf32be_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 4 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 4 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
- let (c0,c1,c2,c3) = ord4 c
- writeWord8Buf oraw ow c0
- writeWord8Buf oraw (ow+1) c1
- writeWord8Buf oraw (ow+2) c2
- writeWord8Buf oraw (ow+3) c3
- loop ir' (ow+4)
+ if isSurrogate c then done InvalidSequence ir ow else do
+ let (c0,c1,c2,c3) = ord4 c
+ writeWord8Buf oraw ow c0
+ writeWord8Buf oraw (ow+1) c1
+ writeWord8Buf oraw (ow+2) c2
+ writeWord8Buf oraw (ow+3) c3
+ loop ir' (ow+4)
in
loop ir0 ow0
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 4 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 4 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
- let (c0,c1,c2,c3) = ord4 c
- writeWord8Buf oraw ow c3
- writeWord8Buf oraw (ow+1) c2
- writeWord8Buf oraw (ow+2) c1
- writeWord8Buf oraw (ow+3) c0
- loop ir' (ow+4)
+ if isSurrogate c then done InvalidSequence ir ow else do
+ let (c0,c1,c2,c3) = ord4 c
+ writeWord8Buf oraw ow c3
+ writeWord8Buf oraw (ow+1) c2
+ writeWord8Buf oraw (ow+2) c1
+ writeWord8Buf oraw (ow+3) c0
+ loop ir' (ow+4)
in
loop ir0 ow0
-----------------------------------------------------------------------------
module GHC.IO.Encoding.UTF8 (
- utf8,
- utf8_bom,
+ utf8, mkUTF8,
+ utf8_bom, mkUTF8_bom
) where
import GHC.Base
import GHC.Num
import GHC.IORef
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
-import Data.Maybe
utf8 :: TextEncoding
-utf8 = TextEncoding { textEncodingName = "UTF-8",
- mkTextDecoder = utf8_DF,
- mkTextEncoder = utf8_EF }
+utf8 = mkUTF8 ErrorOnCodingFailure
-utf8_DF :: IO (TextDecoder ())
-utf8_DF =
+mkUTF8 :: CodingFailureMode -> TextEncoding
+mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8",
+ mkTextDecoder = utf8_DF cfm,
+ mkTextEncoder = utf8_EF cfm }
+
+
+utf8_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf8_DF cfm =
return (BufferCodec {
encode = utf8_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf8_EF :: IO (TextEncoder ())
-utf8_EF =
+utf8_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf8_EF cfm =
return (BufferCodec {
encode = utf8_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf8_bom :: TextEncoding
-utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM",
- mkTextDecoder = utf8_bom_DF,
- mkTextEncoder = utf8_bom_EF }
+utf8_bom = mkUTF8_bom ErrorOnCodingFailure
-utf8_bom_DF :: IO (TextDecoder Bool)
-utf8_bom_DF = do
+mkUTF8_bom :: CodingFailureMode -> TextEncoding
+mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM",
+ mkTextDecoder = utf8_bom_DF cfm,
+ mkTextEncoder = utf8_bom_EF cfm }
+
+utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool)
+utf8_bom_DF cfm = do
ref <- newIORef True
return (BufferCodec {
encode = utf8_bom_decode ref,
+ recover = recoverDecode cfm,
close = return (),
getState = readIORef ref,
setState = writeIORef ref
})
-utf8_bom_EF :: IO (TextEncoder Bool)
-utf8_bom_EF = do
+utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf8_bom_EF cfm = do
ref <- newIORef True
return (BufferCodec {
encode = utf8_bom_encode ref,
+ recover = recoverEncode cfm,
close = return (),
getState = readIORef ref,
setState = writeIORef ref
then utf8_decode input output
else do
let no_bom = do writeIORef ref False; utf8_decode input output
- if iw - ir < 1 then return (input,output) else do
+ if iw - ir < 1 then return (InputUnderflow,input,output) else do
c0 <- readWord8Buf iraw ir
if (c0 /= bom0) then no_bom else do
- if iw - ir < 2 then return (input,output) else do
+ if iw - ir < 2 then return (InputUnderflow,input,output) else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 /= bom1) then no_bom else do
- if iw - ir < 3 then return (input,output) else do
+ if iw - ir < 3 then return (InputUnderflow,input,output) else do
c2 <- readWord8Buf iraw (ir+2)
if (c2 /= bom2) then no_bom else do
-- found a BOM, ignore it and carry on
b <- readIORef ref
if not b then utf8_encode input output
else if os - ow < 3
- then return (input,output)
+ then return (OutputUnderflow,input,output)
else do
writeIORef ref False
writeWord8Buf oraw ow bom0
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
case c0 of
ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
loop (ir+1) ow'
| c0 >= 0xc0 && c0 <= 0xdf ->
- if iw - ir < 2 then done ir ow else do
+ if iw - ir < 2 then done InputUnderflow ir ow else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
ow' <- writeCharBuf oraw ow (chr2 c0 c1)
loop (ir+2) ow'
| c0 >= 0xe0 && c0 <= 0xef ->
case iw - ir of
- 1 -> done ir ow
+ 1 -> done InputUnderflow ir ow
2 -> do -- check for an error even when we don't have
-- the full sequence yet (#3341)
c1 <- readWord8Buf iraw (ir+1)
if not (validate3 c0 c1 0x80)
- then invalid else done ir ow
+ then invalid else done InputUnderflow ir ow
_ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
loop (ir+3) ow'
| c0 >= 0xf0 ->
case iw - ir of
- 1 -> done ir ow
+ 1 -> done InputUnderflow ir ow
2 -> do -- check for an error even when we don't have
-- the full sequence yet (#3341)
c1 <- readWord8Buf iraw (ir+1)
if not (validate4 c0 c1 0x80 0x80)
- then invalid else done ir ow
+ then invalid else done InputUnderflow ir ow
3 -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
if not (validate4 c0 c1 c2 0x80)
- then invalid else done ir ow
+ then invalid else done InputUnderflow ir ow
_ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
| otherwise ->
invalid
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "utf8_decode"
- "invalid UTF-8 byte sequence" Nothing Nothing)
-
utf8_encode :: EncodeBuffer
utf8_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
writeWord8Buf oraw ow (fromIntegral x)
loop ir' (ow+1)
| x <= 0x07FF ->
- if os - ow < 2 then done ir ow else do
+ if os - ow < 2 then done OutputUnderflow ir ow else do
let (c1,c2) = ord2 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
loop ir' (ow+2)
- | x <= 0xFFFF -> do
- if os - ow < 3 then done ir ow else do
+ | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do
+ if os - ow < 3 then done OutputUnderflow ir ow else do
let (c1,c2,c3) = ord3 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
writeWord8Buf oraw (ow+2) c3
loop ir' (ow+3)
| otherwise -> do
- if os - ow < 4 then done ir ow else do
+ if os - ow < 4 then done OutputUnderflow ir ow else do
let (c1,c2,c3,c4) = ord4 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
import GHC.IO.Device (SeekMode(..), IODeviceType(..))
import GHC.Conc.IO
import GHC.IO.Exception
+#ifdef mingw32_HOST_OS
+import GHC.Windows
+#endif
import Foreign
import Foreign.C
-- for this case. We need to detect EPIPE correctly, because it
-- shouldn't be reported as an error when it happens on stdout.
-foreign import ccall unsafe "maperrno" -- in Win32Utils.c
- c_maperrno :: IO ()
-
-- NOTE: "safe" versions of the read/write calls for use by the threaded RTS.
-- These calls may block, but that's ok.
foreign import ccall unsafe "unlockFile"
unlockFile :: CInt -> IO CInt
#endif
-
-puts :: String -> IO ()
-puts s = do _ <- withCStringLen s $ \(p,len) ->
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
("illegal buffer size " ++ showsPrec 9 n []) Nothing Nothing)
-- 9 => should be parens'ified.
+-- ---------------------------------------------------------------------------
+-- Wrapper for Handle encoding/decoding.
+
+-- The interface for TextEncoding changed so that a TextEncoding doesn't raise
+-- an exception if it encounters an invalid sequnce. Furthermore, encoding
+-- returns a reason as to why encoding stopped, letting us know if it was due
+-- to input/output underflow or an invalid sequence.
+--
+-- This code adapts this elaborated interface back to the original TextEncoding
+-- interface.
+--
+-- FIXME: it is possible that Handle code using the haDecoder/haEncoder fields
+-- could be made clearer by using the 'encode' interface directly. I have not
+-- looked into this.
+--
+-- FIXME: we should use recover to deal with EOF, rather than always throwing an
+-- IOException (ioe_invalidCharacter).
+
+streamEncode :: BufferCodec from to state
+ -> Buffer from -> Buffer to
+ -> IO (Buffer from, Buffer to)
+streamEncode codec from to = go (from, to)
+ where
+ go (from, to) = do
+ (why, from', to') <- encode codec from to
+ -- When we are dealing with Handles, we don't care about input/output
+ -- underflow particularly, and we want to delay errors about invalid
+ -- sequences as far as possible.
+ case why of
+ Encoding.InvalidSequence | bufL from == bufL from' -> recover codec from' to' >>= go
+ _ -> return (from', to')
+
-- -----------------------------------------------------------------------------
-- Handle Finalizers
(cbuf',bbuf') <- case haEncoder of
Nothing -> latin1_encode cbuf bbuf
- Just encoder -> (encode encoder) cbuf bbuf
+ Just encoder -> (streamEncode encoder) cbuf bbuf
debugIO ("writeCharBuffer after encoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf')
-- restore the codec state
setState decoder codec_state
- (bbuf1,cbuf1) <- (encode decoder) bbuf0
+ (bbuf1,cbuf1) <- (streamEncode decoder) bbuf0
cbuf0{ bufL=0, bufR=0, bufSize = bufL cbuf0 }
debugIO ("finished, bbuf=" ++ summaryBuffer bbuf1 ++
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf1)
- (encode decoder) bbuf1 cbuf
+ (streamEncode decoder) bbuf1 cbuf
debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf2)
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf2)
- (encode decoder) bbuf2 cbuf
+ (streamEncode decoder) bbuf2 cbuf
debugIO ("readTextDevice' after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf3)
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf0)
- (encode decoder) bbuf0 cbuf
+ (streamEncode decoder) bbuf0 cbuf
writeIORef haByteBuffer bbuf2
return cbuf'
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude, ForeignFunctionInterface #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module : GHC.Windows
+-- Copyright : (c) The University of Glasgow, 2009
+-- License : see libraries/base/LICENSE
+--
+-- Maintainer : libraries@haskell.org
+-- Stability : internal
+-- Portability : non-portable
+--
+-- Windows functionality used by several modules.
+--
+-- ToDo: this just duplicates part of System.Win32.Types, which isn't
+-- available yet. We should move some Win32 functionality down here,
+-- maybe as part of the grand reorganisation of the base package...
+--
+-----------------------------------------------------------------------------
+
+module GHC.Windows where
+
+import GHC.Base
+import GHC.Ptr
+
+import Data.Word
+
+import Foreign.C.Error (throwErrno)
+import Foreign.C.Types
+
+
+type HANDLE = Ptr ()
+type DWORD = Word32
+
+type LPTSTR = Ptr CWchar
+
+iNFINITE :: DWORD
+iNFINITE = 0xFFFFFFFF -- urgh
+
+throwGetLastError :: String -> IO a
+throwGetLastError where_from = c_maperrno >> throwErrno where_from
+
+foreign import ccall unsafe "maperrno" -- in Win32Utils.c
+ c_maperrno :: IO ()
+
import Prelude
#ifdef __GLASGOW_HASKELL__
-import Data.List
import Foreign
import Foreign.C
import Control.Exception.Base ( bracket )
-import Control.Monad
-- import GHC.IO
import GHC.IO.Exception
+import GHC.IO.Encoding (fileSystemEncoding)
+import qualified GHC.Foreign as GHC
+import Data.List
+#ifdef mingw32_HOST_OS
+import GHC.Environment
+import GHC.Windows
+#else
+import Control.Monad
+#endif
#endif
#ifdef __HUGS__
)
#endif
+#ifdef __GLASGOW_HASKELL__
-- ---------------------------------------------------------------------------
-- getArgs, getProgName, getEnv
+#ifdef mingw32_HOST_OS
+
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+
+getWin32ProgArgv_certainly :: IO [String]
+getWin32ProgArgv_certainly = do
+ mb_argv <- getWin32ProgArgv
+ case mb_argv of
+ Nothing -> fmap dropRTSArgs getFullArgs
+ Just argv -> return argv
+
+withWin32ProgArgv :: [String] -> IO a -> IO a
+withWin32ProgArgv argv act = bracket begin setWin32ProgArgv (\_ -> act)
+ where
+ begin = do
+ mb_old_argv <- getWin32ProgArgv
+ setWin32ProgArgv (Just argv)
+ return mb_old_argv
+
+getWin32ProgArgv :: IO (Maybe [String])
+getWin32ProgArgv = alloca $ \p_argc -> alloca $ \p_argv -> do
+ c_getWin32ProgArgv p_argc p_argv
+ argc <- peek p_argc
+ argv_p <- peek p_argv
+ if argv_p == nullPtr
+ then return Nothing
+ else do
+ argv_ps <- peekArray (fromIntegral argc) argv_p
+ fmap Just $ mapM peekCWString argv_ps
+
+setWin32ProgArgv :: Maybe [String] -> IO ()
+setWin32ProgArgv Nothing = c_setWin32ProgArgv 0 nullPtr
+setWin32ProgArgv (Just argv) = withMany withCWString argv $ \argv_ps -> withArrayLen argv_ps $ \argc argv_p -> do
+ c_setWin32ProgArgv (fromIntegral argc) argv_p
+
+foreign import ccall unsafe "getWin32ProgArgv"
+ c_getWin32ProgArgv :: Ptr CInt -> Ptr (Ptr CWString) -> IO ()
+
+foreign import ccall unsafe "setWin32ProgArgv"
+ c_setWin32ProgArgv :: CInt -> Ptr CWString -> IO ()
+
+dropRTSArgs :: [String] -> [String]
+dropRTSArgs [] = []
+dropRTSArgs ("+RTS":rest) = dropRTSArgs (dropWhile (/= "-RTS") rest)
+dropRTSArgs ("--RTS":rest) = rest
+dropRTSArgs ("-RTS":rest) = dropRTSArgs rest
+dropRTSArgs (arg:rest) = arg : dropRTSArgs rest
+
+#endif
+
-- | Computation 'getArgs' returns a list of the program's command
-- line arguments (not including the program name).
-
-#ifdef __GLASGOW_HASKELL__
getArgs :: IO [String]
+
+#ifdef mingw32_HOST_OS
+getArgs = fmap tail getWin32ProgArgv_certainly
+#else
getArgs =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
getProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
- peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString
-
+ peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString fileSystemEncoding)
foreign import ccall unsafe "getProgArgv"
getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
+#endif
{-|
Computation 'getProgName' returns the name of the program as it was
is probably really @FOO.EXE@, and that is what 'getProgName' will return.
-}
getProgName :: IO String
+#ifdef mingw32_HOST_OS
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+getProgName = fmap (basename . head) getWin32ProgArgv_certainly
+#else
getProgName =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
unpackProgName :: Ptr (Ptr CChar) -> IO String -- argv[0]
unpackProgName argv = do
- s <- peekElemOff argv 0 >>= peekCString
+ s <- peekElemOff argv 0 >>= GHC.peekCString fileSystemEncoding
return (basename s)
- where
- basename :: String -> String
- basename f = go f f
- where
- go acc [] = acc
- go acc (x:xs)
- | isPathSeparator x = go xs xs
- | otherwise = go acc xs
-
- isPathSeparator :: Char -> Bool
- isPathSeparator '/' = True
-#ifdef mingw32_HOST_OS
- isPathSeparator '\\' = True
#endif
- isPathSeparator _ = False
+
+basename :: FilePath -> FilePath
+basename f = go f f
+ where
+ go acc [] = acc
+ go acc (x:xs)
+ | isPathSeparator x = go xs xs
+ | otherwise = go acc xs
+
+ isPathSeparator :: Char -> Bool
+ isPathSeparator '/' = True
+#ifdef mingw32_HOST_OS
+ isPathSeparator '\\' = True
+#endif
+ isPathSeparator _ = False
-- | Computation 'getEnv' @var@ returns the value
-- does not exist.
getEnv :: String -> IO String
+#ifdef mingw32_HOST_OS
+getEnv name = withCWString name $ \s -> try_size s 256
+ where
+ try_size s size = allocaArray (fromIntegral size) $ \p_value -> do
+ res <- c_GetEnvironmentVariable s p_value size
+ case res of
+ 0 -> do
+ err <- c_GetLastError
+ if err == eRROR_ENVVAR_NOT_FOUND
+ then ioe_missingEnvVar name
+ else throwGetLastError "getEnv"
+ _ | res > size -> try_size s res -- Rare: size increased between calls to GetEnvironmentVariable
+ | otherwise -> peekCWString p_value
+
+eRROR_ENVVAR_NOT_FOUND :: DWORD
+eRROR_ENVVAR_NOT_FOUND = 203
+
+foreign import stdcall unsafe "windows.h GetLastError"
+ c_GetLastError:: IO DWORD
+
+foreign import stdcall unsafe "windows.h GetEnvironmentVariableW"
+ c_GetEnvironmentVariable :: LPTSTR -> LPTSTR -> DWORD -> IO DWORD
+#else
getEnv name =
withCString name $ \s -> do
litstring <- c_getenv s
if litstring /= nullPtr
- then peekCString litstring
- else ioException (IOError Nothing NoSuchThing "getEnv"
- "no environment variable" Nothing (Just name))
+ then GHC.peekCString fileSystemEncoding litstring
+ else ioe_missingEnvVar name
foreign import ccall unsafe "getenv"
c_getenv :: CString -> IO (Ptr CChar)
+#endif
+
+ioe_missingEnvVar :: String -> IO a
+ioe_missingEnvVar name = ioException (IOError Nothing NoSuchThing "getEnv"
+ "no environment variable" Nothing (Just name))
{-|
'withArgs' @args act@ - while executing action @act@, have 'getArgs'
-- the duration of an action.
withArgv :: [String] -> IO a -> IO a
-withArgv new_args act = do
+
+#ifdef mingw32_HOST_OS
+-- We have to reflect the updated arguments in the RTS-side variables as
+-- well, because the RTS still consults them for error messages and the like.
+-- If we don't do this then ghc-e005 fails.
+withArgv new_args act = withWin32ProgArgv new_args $ withProgArgv new_args act
+#else
+withArgv = withProgArgv
+#endif
+
+withProgArgv :: [String] -> IO a -> IO a
+withProgArgv new_args act = do
pName <- System.Environment.getProgName
existing_args <- System.Environment.getArgs
- bracket (setArgs new_args)
- (\argv -> do _ <- setArgs (pName:existing_args)
- freeArgv argv)
+ bracket (setProgArgv new_args)
+ (\argv -> do _ <- setProgArgv (pName:existing_args)
+ freeProgArgv argv)
(const act)
-freeArgv :: Ptr CString -> IO ()
-freeArgv argv = do
+freeProgArgv :: Ptr CString -> IO ()
+freeProgArgv argv = do
size <- lengthArray0 nullPtr argv
sequence_ [peek (argv `advancePtr` i) >>= free | i <- [size, size-1 .. 0]]
free argv
-setArgs :: [String] -> IO (Ptr CString)
-setArgs argv = do
- vs <- mapM newCString argv >>= newArray0 nullPtr
- setArgsPrim (genericLength argv) vs
+setProgArgv :: [String] -> IO (Ptr CString)
+setProgArgv argv = do
+ vs <- mapM (GHC.newCString fileSystemEncoding) argv >>= newArray0 nullPtr
+ c_setProgArgv (genericLength argv) vs
return vs
foreign import ccall unsafe "setProgArgv"
- setArgsPrim :: CInt -> Ptr CString -> IO ()
+ c_setProgArgv :: CInt -> Ptr CString -> IO ()
-- |'getEnvironment' retrieves the entire environment as a
-- list of @(key,value)@ pairs.
--
-- If an environment entry does not contain an @\'=\'@ character,
-- the @key@ is the whole entry and the @value@ is the empty string.
-
getEnvironment :: IO [(String, String)]
+
+#ifdef mingw32_HOST_OS
+getEnvironment = bracket c_GetEnvironmentStrings c_FreeEnvironmentStrings $ \pBlock ->
+ if pBlock == nullPtr then return []
+ else go pBlock
+ where
+ go pBlock = do
+ -- The block is terminated by a null byte where there
+ -- should be an environment variable of the form X=Y
+ c <- peek pBlock
+ if c == 0 then return []
+ else do
+ -- Seek the next pair (or terminating null):
+ pBlock' <- seekNull pBlock False
+ -- We now know the length in bytes, but ignore it when
+ -- getting the actual String:
+ str <- peekCWString pBlock
+ fmap (divvy str :) $ go pBlock'
+
+ -- Returns pointer to the byte *after* the next null
+ seekNull pBlock done = do
+ let pBlock' = pBlock `plusPtr` sizeOf (undefined :: CWchar)
+ if done then return pBlock'
+ else do
+ c <- peek pBlock'
+ seekNull pBlock' (c == (0 :: Word8 ))
+
+foreign import stdcall unsafe "windows.h GetEnvironmentStringsW"
+ c_GetEnvironmentStrings :: IO (Ptr CWchar)
+
+foreign import stdcall unsafe "windows.h FreeEnvironmentStringsW"
+ c_FreeEnvironmentStrings :: Ptr CWchar -> IO Bool
+#else
getEnvironment = do
pBlock <- getEnvBlock
if pBlock == nullPtr then return []
else do
- stuff <- peekArray0 nullPtr pBlock >>= mapM peekCString
+ stuff <- peekArray0 nullPtr pBlock >>= mapM (GHC.peekCString fileSystemEncoding)
return (map divvy stuff)
- where
- divvy str =
- case break (=='=') str of
- (xs,[]) -> (xs,[]) -- don't barf (like Posix.getEnvironment)
- (name,_:value) -> (name,value)
foreign import ccall unsafe "__hscore_environ"
getEnvBlock :: IO (Ptr CString)
+#endif
+
+divvy :: String -> (String, String)
+divvy str =
+ case break (=='=') str of
+ (xs,[]) -> (xs,[]) -- don't barf (like Posix.getEnvironment)
+ (name,_:value) -> (name,value)
#endif /* __GLASGOW_HASKELL__ */
#ifdef __GLASGOW_HASKELL__
import GHC.Base
-import GHC.IO hiding ( onException )
+import GHC.IO hiding ( bracket, onException )
import GHC.IO.IOMode
import GHC.IO.Handle.FD
import qualified GHC.IO.FD as FD
import GHC.IO.IOMode
import GHC.IO.Exception
import GHC.IO.Device
+#ifndef mingw32_HOST_OS
+import {-# SOURCE #-} GHC.IO.Encoding (fileSystemEncoding)
+import qualified GHC.Foreign as GHC
+#endif
#elif __HUGS__
import Hugs.Prelude (IOException(..), IOErrorType(..))
import Hugs.IO (IOMode(..))
{-# CFILES cbits/PrelIOUtils.c cbits/consUtils.c #-}
#endif
+
+-- ---------------------------------------------------------------------------
+-- Debugging the base package
+
+puts :: String -> IO ()
+puts s = withCAStringLen (s ++ "\n") $ \(p, len) -> do
+ -- In reality should be withCString, but assume ASCII to avoid loop
+ -- if this is called by GHC.Foreign
+ _ <- c_write 1 (castPtr p) (fromIntegral len)
+ return ()
+
+
-- ---------------------------------------------------------------------------
-- Types
#ifdef mingw32_HOST_OS
withFilePath :: FilePath -> (CWString -> IO a) -> IO a
-withFilePath = withCWString
+withFilePath = withCWString
+
+peekFilePath :: CWString -> IO FilePath
+peekFilePath = peekCWString
#else
+
withFilePath :: FilePath -> (CString -> IO a) -> IO a
+peekFilePath :: CString -> IO FilePath
+peekFilePathLen :: CStringLen -> IO FilePath
+
+#if __GLASGOW_HASKELL__
+withFilePath = GHC.withCString fileSystemEncoding
+peekFilePath = GHC.peekCString fileSystemEncoding
+peekFilePathLen = GHC.peekCStringLen fileSystemEncoding
+#else
withFilePath = withCString
+peekFilePath = peekCString
+peekFilePathLen = peekCStringLen
+#endif
+
#endif
-- ---------------------------------------------------------------------------
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude #-}
+module System.Posix.Internals where
+
+import GHC.IO
+import GHC.Base
+
+puts :: String -> IO ()
GHC.Exts,
GHC.Float,
GHC.Float.RealFracMethods,
+ GHC.Foreign,
GHC.Float.ConversionUtils,
GHC.ForeignPtr,
GHC.MVar,
GHC.IO.Encoding.Types,
GHC.IO.Encoding.Iconv,
GHC.IO.Encoding.CodePage,
+ GHC.IO.Encoding.Failure,
GHC.IO.Handle,
GHC.IO.Handle.Types,
GHC.IO.Handle.Internals,
if os(windows)
exposed-modules: GHC.IO.Encoding.CodePage.Table
GHC.Conc.Windows
+ GHC.Windows
}
exposed-modules:
Control.Applicative,