#ifdef __GLASGOW_HASKELL__
import GHC.Base
-import GHC.IO hiding (finally,onException)
+import GHC.IO hiding (bracket,finally,onException)
import GHC.IO.Exception
import GHC.Exception
import GHC.Show
-----------------------------------------------------------------------------
module Foreign.C.String ( -- representation of strings in C
-
-- * C strings
CString, -- = Ptr CChar
-- ** Using a locale-dependent encoding
+#ifndef __GLASGOW_HASKELL__
-- | Currently these functions are identical to their @CAString@ counterparts;
-- eventually they will use an encoding determined by the current locale.
+#else
+ -- | These functions are different from their @CAString@ counterparts
+ -- in that they will use an encoding determined by the current locale,
+ -- rather than always assuming ASCII.
+#endif
-- conversion of C strings into Haskell strings
--
import Data.Word
#ifdef __GLASGOW_HASKELL__
+import Control.Monad
+
import GHC.List
import GHC.Real
import GHC.Num
import GHC.Base
+
+import {-# SOURCE #-} GHC.IO.Encoding
+import qualified GHC.Foreign as GHC
#else
import Data.Char ( chr, ord )
#define unsafeChr chr
-- | Marshal a NUL terminated C string into a Haskell string.
--
peekCString :: CString -> IO String
+#ifndef __GLASGOW_HASKELL__
peekCString = peekCAString
+#else
+peekCString = GHC.peekCString foreignEncoding
+#endif
-- | Marshal a C string with explicit length into a Haskell string.
--
peekCStringLen :: CStringLen -> IO String
+#ifndef __GLASGOW_HASKELL__
peekCStringLen = peekCAStringLen
+#else
+peekCStringLen = GHC.peekCStringLen foreignEncoding
+#endif
-- | Marshal a Haskell string into a NUL terminated C string.
--
-- 'Foreign.Marshal.Alloc.finalizerFree'.
--
newCString :: String -> IO CString
+#ifndef __GLASGOW_HASKELL__
newCString = newCAString
+#else
+newCString = GHC.newCString foreignEncoding
+#endif
-- | Marshal a Haskell string into a C string (ie, character array) with
-- explicit length information.
-- 'Foreign.Marshal.Alloc.finalizerFree'.
--
newCStringLen :: String -> IO CStringLen
+#ifndef __GLASGOW_HASKELL__
newCStringLen = newCAStringLen
+#else
+newCStringLen = GHC.newCStringLen foreignEncoding
+#endif
-- | Marshal a Haskell string into a NUL terminated C string using temporary
-- storage.
-- storage must /not/ be used after this.
--
withCString :: String -> (CString -> IO a) -> IO a
+#ifndef __GLASGOW_HASKELL__
withCString = withCAString
+#else
+withCString = GHC.withCString foreignEncoding
+#endif
-- | Marshal a Haskell string into a C string (ie, character array)
-- in temporary storage, with explicit length information.
-- storage must /not/ be used after this.
--
withCStringLen :: String -> (CStringLen -> IO a) -> IO a
+#ifndef __GLASGOW_HASKELL__
withCStringLen = withCAStringLen
+#else
+withCStringLen = GHC.withCStringLen foreignEncoding
+#endif
+
+#ifndef __GLASGOW_HASKELL__
-- | Determines whether a character can be accurately encoded in a 'CString'.
-- Unrepresentable characters are converted to @\'?\'@.
--
-- Currently only Latin-1 characters are representable.
charIsRepresentable :: Char -> IO Bool
charIsRepresentable c = return (ord c < 256)
+#else
+-- -- | Determines whether a character can be accurately encoded in a 'CString'.
+-- -- Unrepresentable characters are converted to '?' or their nearest visual equivalent.
+charIsRepresentable :: Char -> IO Bool
+charIsRepresentable = GHC.charIsRepresentable foreignEncoding
+#endif
-- single byte characters
-- ----------------------
import Data.Bits (shiftR)
import Data.Maybe (Maybe(..))
import Data.Typeable
-import Foreign.C.Error (throwErrno)
import GHC.Base
import GHC.Conc.Sync
import GHC.Enum (Enum)
import GHC.Real (div, fromIntegral)
import GHC.Show (Show)
import GHC.Word (Word32, Word64)
+import GHC.Windows
-- ----------------------------------------------------------------------------
-- Thread waiting
r <- c_WaitForSingleObject wakeup timeout
case r of
- 0xffffffff -> do c_maperrno; throwErrno "service_loop"
+ 0xffffffff -> do throwGetLastError "service_loop"
0 -> do
r2 <- c_readIOManagerEvent
exit <-
milli_seconds = (micro_seconds + 999) `div` 1000
in return (all, fromIntegral milli_seconds)
--- ToDo: this just duplicates part of System.Win32.Types, which isn't
--- available yet. We should move some Win32 functionality down here,
--- maybe as part of the grand reorganisation of the base package...
-type HANDLE = Ptr ()
-type DWORD = Word32
-
-iNFINITE :: DWORD
-iNFINITE = 0xFFFFFFFF -- urgh
-
foreign import ccall unsafe "getIOManagerEvent" -- in the RTS (ThrIOManager.c)
c_getIOManagerEvent :: IO HANDLE
foreign import ccall unsafe "sendIOManagerEvent" -- in the RTS (ThrIOManager.c)
c_sendIOManagerEvent :: Word32 -> IO ()
-foreign import ccall unsafe "maperrno" -- in Win32Utils.c
- c_maperrno :: IO ()
-
foreign import stdcall "WaitForSingleObject"
c_WaitForSingleObject :: HANDLE -> DWORD -> IO DWORD
-{-# LANGUAGE ForeignFunctionInterface #-}
+{-# LANGUAGE CPP, ForeignFunctionInterface #-}
module GHC.Environment (getFullArgs) where
import Prelude
import Foreign
import Foreign.C
+
+#ifdef mingw32_HOST_OS
+import GHC.IO (finally)
+import GHC.Windows
+
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+getFullArgs :: IO [String]
+getFullArgs = do
+ p_arg_string <- c_GetCommandLine
+ alloca $ \p_argc -> do
+ p_argv <- c_CommandLineToArgv p_arg_string p_argc
+ if p_argv == nullPtr
+ then throwGetLastError "getFullArgs"
+ else flip finally (c_LocalFree p_argv) $ do
+ argc <- peek p_argc
+ p_argvs <- peekArray (fromIntegral argc) p_argv
+ mapM peekCWString p_argvs
+
+foreign import stdcall unsafe "windows.h GetCommandLineW"
+ c_GetCommandLine :: IO (Ptr CWString)
+
+foreign import stdcall unsafe "windows.h CommandLineToArgvW"
+ c_CommandLineToArgv :: Ptr CWString -> Ptr CInt -> IO (Ptr CWString)
+
+foreign import stdcall unsafe "Windows.h LocalFree"
+ c_LocalFree :: Ptr a -> IO (Ptr a)
+#else
import Control.Monad
+import GHC.IO.Encoding
+import qualified GHC.Foreign as GHC
+
getFullArgs :: IO [String]
getFullArgs =
alloca $ \ p_argc ->
getFullProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
- peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString
+ peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString fileSystemEncoding)
foreign import ccall unsafe "getFullProgArgv"
getFullProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
-
+#endif
\ No newline at end of file
--- /dev/null
+{-# LANGUAGE CPP, NoImplicitPrelude #-}
+
+-----------------------------------------------------------------------------
+-- |
+-- Module : GHC.IO.Encoding
+-- Copyright : (c) The University of Glasgow, 2008-2011
+-- License : see libraries/base/LICENSE
+--
+-- Maintainer : libraries@haskell.org
+-- Stability : internal
+-- Portability : non-portable
+--
+-- Foreign marshalling support for CStrings with configurable encodings
+--
+-----------------------------------------------------------------------------
+
+module GHC.Foreign (
+ -- * C strings with a configurable encoding
+
+ -- conversion of C strings into Haskell strings
+ --
+ peekCString, -- :: TextEncoding -> CString -> IO String
+ peekCStringLen, -- :: TextEncoding -> CStringLen -> IO String
+
+ -- conversion of Haskell strings into C strings
+ --
+ newCString, -- :: TextEncoding -> String -> IO CString
+ newCStringLen, -- :: TextEncoding -> String -> IO CStringLen
+
+ -- conversion of Haskell strings into C strings using temporary storage
+ --
+ withCString, -- :: TextEncoding -> String -> (CString -> IO a) -> IO a
+ withCStringLen, -- :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+
+ charIsRepresentable, -- :: TextEncoding -> Char -> IO Bool
+ ) where
+
+import Foreign.Marshal.Array
+import Foreign.C.Types
+import Foreign.Ptr
+import Foreign.Storable
+
+import Data.Word
+
+-- Imports for the locale-encoding version of marshallers
+import Control.Monad
+
+import Data.Tuple (fst)
+import Data.Maybe
+
+import {-# SOURCE #-} System.Posix.Internals (puts)
+import GHC.Show ( show )
+
+import Foreign.Marshal.Alloc
+import Foreign.ForeignPtr
+
+import GHC.Err (undefined)
+import GHC.List
+import GHC.Num
+import GHC.Base
+
+import GHC.IO
+import GHC.IO.Exception
+import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
+import GHC.IO.Encoding.Types
+
+
+c_DEBUG_DUMP :: Bool
+c_DEBUG_DUMP = False
+
+putDebugMsg :: String -> IO ()
+putDebugMsg | c_DEBUG_DUMP = puts
+ | otherwise = const (return ())
+
+
+-- These definitions are identical to those in Foreign.C.String, but copied in here to avoid a cycle:
+type CString = Ptr CChar
+type CStringLen = (Ptr CChar, Int)
+
+-- exported functions
+-- ------------------
+
+-- | Marshal a NUL terminated C string into a Haskell string.
+--
+peekCString :: TextEncoding -> CString -> IO String
+peekCString enc cp = do
+ sz <- lengthArray0 nUL cp
+ peekEncodedCString enc (cp, sz * cCharSize)
+
+-- | Marshal a C string with explicit length into a Haskell string.
+--
+peekCStringLen :: TextEncoding -> CStringLen -> IO String
+peekCStringLen = peekEncodedCString
+
+-- | Marshal a Haskell string into a NUL terminated C string.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * new storage is allocated for the C string and must be
+-- explicitly freed using 'Foreign.Marshal.Alloc.free' or
+-- 'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCString :: TextEncoding -> String -> IO CString
+newCString enc = liftM fst . newEncodedCString enc True
+
+-- | Marshal a Haskell string into a C string (ie, character array) with
+-- explicit length information.
+--
+-- * new storage is allocated for the C string and must be
+-- explicitly freed using 'Foreign.Marshal.Alloc.free' or
+-- 'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCStringLen :: TextEncoding -> String -> IO CStringLen
+newCStringLen enc = newEncodedCString enc False
+
+-- | Marshal a Haskell string into a NUL terminated C string using temporary
+-- storage.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * the memory is freed when the subcomputation terminates (either
+-- normally or via an exception), so the pointer to the temporary
+-- storage must /not/ be used after this.
+--
+withCString :: TextEncoding -> String -> (CString -> IO a) -> IO a
+withCString enc s act = withEncodedCString enc True s $ \(cp, _sz) -> act cp
+
+-- | Marshal a Haskell string into a C string (ie, character array)
+-- in temporary storage, with explicit length information.
+--
+-- * the memory is freed when the subcomputation terminates (either
+-- normally or via an exception), so the pointer to the temporary
+-- storage must /not/ be used after this.
+--
+withCStringLen :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+withCStringLen enc = withEncodedCString enc False
+
+
+-- | Determines whether a character can be accurately encoded in a 'CString'.
+--
+-- Pretty much anyone who uses this function is in a state of sin because
+-- whether or not a character is encodable will, in general, depend on the
+-- context in which it occurs.
+charIsRepresentable :: TextEncoding -> Char -> IO Bool
+charIsRepresentable enc c = withCString enc [c] (fmap (== [c]) . peekCString enc) `catchException` (\e -> let _ = e :: IOException in return False)
+
+-- auxiliary definitions
+-- ----------------------
+
+-- C's end of string character
+nUL :: CChar
+nUL = 0
+
+-- Size of a CChar in bytes
+cCharSize :: Int
+cCharSize = sizeOf (undefined :: CChar)
+
+
+{-# INLINE peekEncodedCString #-}
+peekEncodedCString :: TextEncoding -- ^ Encoding of CString
+ -> CStringLen
+ -> IO String -- ^ String in Haskell terms
+peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
+ = bracket mk_decoder close $ \decoder -> do
+ let chunk_size = sz_bytes `max` 1 -- Decode buffer chunk size in characters: one iteration only for ASCII
+ from0 <- fmap (\fp -> bufferAdd sz_bytes (emptyBuffer fp sz_bytes ReadBuffer)) $ newForeignPtr_ (castPtr p)
+ to <- newCharBuffer chunk_size WriteBuffer
+
+ let go iteration from = do
+ (why, from', to') <- encode decoder from to
+ if isEmptyBuffer from'
+ then
+ -- No input remaining: @why@ will be InputUnderflow, but we don't care
+ fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
+ else do
+ -- Input remaining: what went wrong?
+ putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
+ (from'', to'') <- case why of InvalidSequence -> recover decoder from' to' -- These conditions are equally bad because
+ InputUnderflow -> recover decoder from' to' -- they indicate malformed/truncated input
+ OutputUnderflow -> return (from', to') -- We will have more space next time round
+ putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
+ putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
+ to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
+ fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
+
+ go (0 :: Int) from0
+
+{-# INLINE withEncodedCString #-}
+withEncodedCString :: TextEncoding -- ^ Encoding of CString to create
+ -> Bool -- ^ Null-terminate?
+ -> String -- ^ String to encode
+ -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
+ -> IO a
+withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
+ = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+ from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+ let go iteration to_sz_bytes = do
+ putDebugMsg ("withEncodedCString: " ++ show iteration)
+ allocaBytes to_sz_bytes $ \to_p -> do
+ mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes act
+ case mb_res of
+ Nothing -> go (iteration + 1) (to_sz_bytes * 2)
+ Just res -> return res
+
+ -- If the input string is ASCII, this value will ensure we only allocate once
+ go (0 :: Int) (cCharSize * (sz + 1))
+
+{-# INLINE newEncodedCString #-}
+newEncodedCString :: TextEncoding -- ^ Encoding of CString to create
+ -> Bool -- ^ Null-terminate?
+ -> String -- ^ String to encode
+ -> IO CStringLen
+newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
+ = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+ from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+ let go iteration to_p to_sz_bytes = do
+ putDebugMsg ("newEncodedCString: " ++ show iteration)
+ mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes return
+ case mb_res of
+ Nothing -> do
+ let to_sz_bytes' = to_sz_bytes * 2
+ to_p' <- reallocBytes to_p to_sz_bytes'
+ go (iteration + 1) to_p' to_sz_bytes'
+ Just res -> return res
+
+ -- If the input string is ASCII, this value will ensure we only allocate once
+ let to_sz_bytes = cCharSize * (sz + 1)
+ to_p <- mallocBytes to_sz_bytes
+ go (0 :: Int) to_p to_sz_bytes
+
+
+tryFillBufferAndCall :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int
+ -> (CStringLen -> IO a) -> IO (Maybe a)
+tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do
+ to_fp <- newForeignPtr_ to_p
+ go (0 :: Int) (from0, emptyBuffer to_fp to_sz_bytes WriteBuffer)
+ where
+ go iteration (from, to) = do
+ (why, from', to') <- encode encoder from to
+ putDebugMsg ("tryFillBufferAndCall: " ++ show iteration ++ " " ++ show why ++ " " ++ summaryBuffer from ++ " " ++ summaryBuffer from')
+ if isEmptyBuffer from'
+ then if null_terminate && bufferAvailable to' == 0
+ then return Nothing -- We had enough for the string but not the terminator: ask the caller for more buffer
+ else do
+ -- Awesome, we had enough buffer
+ let bytes = bufferElems to'
+ withBuffer to' $ \to_ptr -> do
+ when null_terminate $ pokeElemOff to_ptr (bufR to') 0
+ fmap Just $ act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes*
+ else case why of -- We didn't consume all of the input
+ InputUnderflow -> recover encoder from' to' >>= go (iteration + 1) -- These conditions are equally bad
+ InvalidSequence -> recover encoder from' to' >>= go (iteration + 1) -- since the input was truncated/invalid
+ OutputUnderflow -> return Nothing -- Oops, out of buffer during decoding: ask the caller for more
mask, mask_, uninterruptibleMask, uninterruptibleMask_,
MaskingState(..), getMaskingState,
block, unblock, blocked, unsafeUnmask,
- onException, finally, evaluate
+ onException, bracket, finally, evaluate
) where
import GHC.Base
MaskedInterruptible -> blockUninterruptible $ io block
MaskedUninterruptible -> io id
+bracket
+ :: IO a -- ^ computation to run first (\"acquire resource\")
+ -> (a -> IO b) -- ^ computation to run last (\"release resource\")
+ -> (a -> IO c) -- ^ computation to run in-between
+ -> IO c -- returns the value from the in-between computation
+bracket before after thing =
+ mask $ \restore -> do
+ a <- before
+ r <- restore (thing a) `onException` after a
+ _ <- after a
+ return r
+
finally :: IO a -- ^ computation to run first
-> IO b -- ^ computation to run afterward (even if an exception
-- was raised)
-----------------------------------------------------------------------------
module GHC.IO.Encoding (
- BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
+ BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
latin1, latin1_encode, latin1_decode,
utf8, utf8_bom,
utf16, utf16le, utf16be,
utf32, utf32le, utf32be,
- localeEncoding,
+ localeEncoding, fileSystemEncoding, foreignEncoding,
mkTextEncoding,
) where
import GHC.Base
--import GHC.IO
+import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
#if !defined(mingw32_HOST_OS)
import qualified GHC.IO.Encoding.UTF16 as UTF16
import qualified GHC.IO.Encoding.UTF32 as UTF32
-#if defined(mingw32_HOST_OS)
+import Data.List
import Data.Maybe
-import GHC.IO.Exception
-#endif
-- -----------------------------------------------------------------------------
utf32be = UTF32.utf32be
-- | The Unicode encoding of the current locale
-localeEncoding :: TextEncoding
+localeEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but allowing arbitrary
+-- undecodable bytes to be round-tripped through it.
+--
+-- This 'TextEncoding' is used to decode and encode command line arguments
+-- and environment variables on non-Windows platforms.
+--
+-- On Windows, this encoding *should not* be used if possible because
+-- the use of code pages is deprecated: Strings should be retrieved
+-- via the "wide" W-family of UTF-16 APIs instead
+fileSystemEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but where undecodable
+-- bytes are replaced with their closest visual match. Used for
+-- the 'CString' marshalling functions in "Foreign.C.String"
+foreignEncoding :: TextEncoding
+
#if !defined(mingw32_HOST_OS)
localeEncoding = Iconv.localeEncoding
+fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
+foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
#else
localeEncoding = CodePage.localeEncoding
+fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
+foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif
-- | Look up the named Unicode encoding. May fail with
-- @CP@; for example, @\"CP1250\"@.
--
mkTextEncoding :: String -> IO TextEncoding
-#if !defined(mingw32_HOST_OS)
-mkTextEncoding = Iconv.mkTextEncoding
+mkTextEncoding e = case mb_coding_failure_mode of
+ Nothing -> unknown_encoding
+ Just cfm -> case enc of
+ "UTF-8" -> return $ UTF8.mkUTF8 cfm
+ "UTF-16" -> return $ UTF16.mkUTF16 cfm
+ "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
+ "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
+ "UTF-32" -> return $ UTF32.mkUTF32 cfm
+ "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
+ "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
+#if defined(mingw32_HOST_OS)
+ 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
+ _ -> unknown_encoding
#else
-mkTextEncoding "UTF-8" = return utf8
-mkTextEncoding "UTF-16" = return utf16
-mkTextEncoding "UTF-16LE" = return utf16le
-mkTextEncoding "UTF-16BE" = return utf16be
-mkTextEncoding "UTF-32" = return utf32
-mkTextEncoding "UTF-32LE" = return utf32le
-mkTextEncoding "UTF-32BE" = return utf32be
-mkTextEncoding ('C':'P':n)
- | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
-mkTextEncoding e = ioException
- (IOError Nothing NoSuchThing "mkTextEncoding"
- ("unknown encoding:" ++ e) Nothing Nothing)
+ _ -> Iconv.mkIconvEncoding cfm enc
#endif
+ where
+ -- The only problem with actually documenting //IGNORE and //TRANSLIT as
+ -- supported suffixes is that they are not necessarily supported with non-GNU iconv
+ (enc, suffix) = span (/= '/') e
+ mb_coding_failure_mode = case suffix of
+ "" -> Just ErrorOnCodingFailure
+ "//IGNORE" -> Just IgnoreCodingFailure
+ "//TRANSLIT" -> Just TransliterateCodingFailure
+ "//ROUNDTRIP" -> Just RoundtripFailure
+ _ -> Nothing
+
+ unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
+ ("unknown encoding:" ++ e) Nothing Nothing)
latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
-latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
+latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for binary
--latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
-latin1_decode = Latin1.latin1_decode
+latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
--latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude #-}
+module GHC.IO.Encoding where
+
+import GHC.IO.Encoding.Types
+
+localeEncoding, fileSystemEncoding, foreignEncoding :: TextEncoding
\ No newline at end of file
#if !defined(mingw32_HOST_OS)
) where
#else
- codePageEncoding,
- localeEncoding
+ codePageEncoding, mkCodePageEncoding,
+ localeEncoding, mkLocaleEncoding
) where
import GHC.Base
import GHC.Enum
import GHC.Word
import GHC.IO (unsafePerformIO)
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.IO.Buffer
-import GHC.IO.Exception
import Data.Bits
import Data.Maybe
import Data.List (lookup)
import GHC.IO.Encoding.CodePage.Table
-import GHC.IO.Encoding.Latin1 (latin1)
-import GHC.IO.Encoding.UTF8 (utf8)
-import GHC.IO.Encoding.UTF16 (utf16le, utf16be)
-import GHC.IO.Encoding.UTF32 (utf32le, utf32be)
+import GHC.IO.Encoding.Latin1 (mkLatin1)
+import GHC.IO.Encoding.UTF8 (mkUTF8)
+import GHC.IO.Encoding.UTF16 (mkUTF16le, mkUTF16be)
+import GHC.IO.Encoding.UTF32 (mkUTF32le, mkUTF32be)
-- note CodePage = UInt which might not work on Win64. But the Win32 package
-- also has this issue.
foreign import stdcall unsafe "windows.h GetACP"
getACP :: IO Word32
-{-# NOINLINE localeEncoding #-}
+{-# NOINLINE currentCodePage #-}
+currentCodePage :: Word32
+currentCodePage = unsafePerformIO getCurrentCodePage
+
localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ fmap codePageEncoding getCurrentCodePage
-
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = mkCodePageEncoding cfm currentCodePage
+
codePageEncoding :: Word32 -> TextEncoding
-codePageEncoding 65001 = utf8
-codePageEncoding 1200 = utf16le
-codePageEncoding 1201 = utf16be
-codePageEncoding 12000 = utf32le
-codePageEncoding 12001 = utf32be
-codePageEncoding cp = maybe latin1 (buildEncoding cp) (lookup cp codePageMap)
-
-buildEncoding :: Word32 -> CodePageArrays -> TextEncoding
-buildEncoding cp SingleByteCP {decoderArray = dec, encoderArray = enc}
+codePageEncoding = mkCodePageEncoding ErrorOnCodingFailure
+
+mkCodePageEncoding :: CodingFailureMode -> Word32 -> TextEncoding
+mkCodePageEncoding cfm 65001 = mkUTF8 cfm
+mkCodePageEncoding cfm 1200 = mkUTF16le cfm
+mkCodePageEncoding cfm 1201 = mkUTF16be cfm
+mkCodePageEncoding cfm 12000 = mkUTF32le cfm
+mkCodePageEncoding cfm 12001 = mkUTF32be cfm
+mkCodePageEncoding cfm cp = maybe (mkLatin1 cfm) (buildEncoding cfm cp) (lookup cp codePageMap)
+
+buildEncoding :: CodingFailureMode -> Word32 -> CodePageArrays -> TextEncoding
+buildEncoding cfm cp SingleByteCP {decoderArray = dec, encoderArray = enc}
= TextEncoding {
- textEncodingName = "CP" ++ show cp,
- mkTextDecoder = return $ simpleCodec
- $ decodeFromSingleByte dec
- , mkTextEncoder = return $ simpleCodec $ encodeToSingleByte enc
+ textEncodingName = "CP" ++ show cp
+ , mkTextDecoder = return $ simpleCodec (recoverDecode cfm) $ decodeFromSingleByte dec
+ , mkTextEncoder = return $ simpleCodec (recoverEncode cfm) $ encodeToSingleByte enc
}
simpleCodec :: (Buffer from -> Buffer to -> IO (Buffer from, Buffer to))
+ -> (Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to))
-> BufferCodec from to ()
-simpleCodec f = BufferCodec {encode = f, close = return (), getState = return (),
- setState = return }
+simpleCodec r f = BufferCodec {
+ encode = f,
+ recover = r,
+ close = return (),
+ getState = return (),
+ setState = return
+ }
decodeFromSingleByte :: ConvArray Char -> DecodeBuffer
decodeFromSingleByte convArr
input@Buffer { bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer { bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir==iw then input{ bufL=0, bufR=0}
- else input{ bufL=ir},
- output {bufR=ow})
+ done why !ir !ow = return (why,
+ if ir==iw then input{ bufL=0, bufR=0}
+ else input{ bufL=ir},
+ output {bufR=ow})
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
b <- readWord8Buf iraw ir
let c = lookupConv convArr b
ow' <- writeCharBuf oraw ow c
loop (ir+1) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
in loop ir0 ow0
encodeToSingleByte :: CompactArray Char Word8 -> EncodeBuffer
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir==iw then input { bufL=0, bufR=0 }
- else input { bufL=ir },
- output {bufR=ow})
+ done why !ir !ow = return (why,
+ if ir==iw then input { bufL=0, bufR=0 }
+ else input { bufL=ir },
+ output {bufR=ow})
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case lookupCompact maxChar indices values c of
writeWord8Buf oraw ow b
loop ir' (ow+1)
where
- invalid = if ir > ir0 then done ir ow else ioe_encodingError
+ invalid = done InvalidSequence ir ow
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "codePageEncoding"
- "invalid code page byte sequence" Nothing Nothing)
-
-ioe_encodingError :: IO a
-ioe_encodingError = ioException
- (IOError Nothing InvalidArgument "codePageEncoding"
- "character is not in the code page" Nothing Nothing)
-
--------------------------------------------
-- Array access functions
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module : GHC.IO.Encoding.Failure
+-- Copyright : (c) The University of Glasgow, 2008-2011
+-- License : see libraries/base/LICENSE
+--
+-- Maintainer : libraries@haskell.org
+-- Stability : internal
+-- Portability : non-portable
+--
+-- Types for specifying how text encoding/decoding fails
+--
+-----------------------------------------------------------------------------
+
+module GHC.IO.Encoding.Failure (
+ CodingFailureMode(..), codingFailureModeSuffix,
+ isSurrogate,
+ surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
+ recoverDecode, recoverEncode
+ ) where
+
+import GHC.IO
+import GHC.IO.Buffer
+import GHC.IO.Exception
+
+import GHC.Base
+import GHC.Word
+import GHC.Show
+import GHC.Num
+import GHC.Real ( fromIntegral )
+
+--import System.Posix.Internals
+
+import Data.Maybe
+
+-- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies
+-- how they handle illegal sequences.
+data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered
+ | IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered
+ | TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence
+ | RoundtripFailure -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped.
+ deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense
+ -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because
+ -- ASCII characters must be padded to two bytes to retain their meaning.
+
+-- Note [Roundtripping]
+-- ~~~~~~~~~~~~~~~~~~~~
+--
+-- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints
+-- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use
+-- the range of private-use characters from 0xEF80 to 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery.
+--
+-- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when
+-- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a
+-- chance to replace it with the byte we originally escaped.
+--
+-- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace
+-- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString)
+-- we have to do the inverse process.
+--
+-- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them.
+-- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
+
+codingFailureModeSuffix :: CodingFailureMode -> String
+codingFailureModeSuffix ErrorOnCodingFailure = ""
+codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
+codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
+codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP"
+
+-- | In transliterate mode, we use this character when decoding unknown bytes.
+--
+-- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
+unrepresentableChar :: Char
+unrepresentableChar = '\xFFFD'
+
+-- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an
+-- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't
+-- give valid Unicode.
+--
+-- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's
+-- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding.
+isSurrogate :: Char -> Bool
+isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
+ where x = ord c
+
+-- | We use some private-use characters for roundtripping unknown bytes through a String
+isRoundtripEscapeChar :: Char -> Bool
+isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000
+ where x = ord c
+
+-- | We use some surrogate characters for roundtripping unknown bytes through a String
+isRoundtripEscapeSurrogateChar :: Char -> Bool
+isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
+ where x = ord c
+
+-- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem)
+surrogatifyRoundtripCharacter :: Char -> Char
+surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xEF00 + 0xDC00)
+ | otherwise = c
+
+-- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings)
+desurrogatifyRoundtripCharacter :: Char -> Char
+desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xEF00)
+ | otherwise = c
+
+-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
+escapeToRoundtripCharacterSurrogate :: Word8 -> Char
+escapeToRoundtripCharacterSurrogate b
+ | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset.
+ | otherwise = chr (0xDC00 + fromIntegral b)
+
+-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
+unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
+unescapeRoundtripCharacterSurrogate c
+ | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
+ | otherwise = Nothing
+ where x = ord c
+
+recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
+recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
+ output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
+ --puts $ "recoverDecode " ++ show ir
+ case cfm of
+ ErrorOnCodingFailure -> ioe_decodingError
+ IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
+ TransliterateCodingFailure -> do
+ ow' <- writeCharBuf oraw ow unrepresentableChar
+ return (input { bufL=ir+1 }, output { bufR=ow' })
+ RoundtripFailure -> do
+ b <- readWord8Buf iraw ir
+ ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
+ return (input { bufL=ir+1 }, output { bufR=ow' })
+
+recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
+recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
+ output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
+ (c,ir') <- readCharBuf iraw ir
+ --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
+ case cfm of
+ IgnoreCodingFailure -> return (input { bufL=ir' }, output)
+ TransliterateCodingFailure -> do
+ if c == '?'
+ then return (input { bufL=ir' }, output)
+ else do
+ -- XXX: evil hack! To implement transliteration, we just poke an
+ -- ASCII ? into the input buffer and tell the caller to try and decode
+ -- again. This is *probably* safe given current uses of TextEncoding.
+ --
+ -- The "if" test above ensures we skip if the encoding fails to deal with
+ -- the ?, though this should never happen in practice as all encodings are
+ -- in fact capable of reperesenting all ASCII characters.
+ _ir' <- writeCharBuf iraw ir '?'
+ return (input, output)
+
+ -- This implementation does not work because e.g. UTF-16 requires 2 bytes to
+ -- encode a simple ASCII value
+ --writeWord8Buf oraw ow unrepresentableByte
+ --return (input { bufL=ir' }, output { bufR=ow+1 })
+ RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
+ writeWord8Buf oraw ow x
+ return (input { bufL=ir' }, output { bufR=ow+1 })
+ _ -> ioe_encodingError
+
+ioe_decodingError :: IO a
+ioe_decodingError = ioException
+ (IOError Nothing InvalidArgument "recoverDecode"
+ "invalid byte sequence" Nothing Nothing)
+
+ioe_encodingError :: IO a
+ioe_encodingError = ioException
+ (IOError Nothing InvalidArgument "recoverEncode"
+ "invalid character" Nothing Nothing)
-- #hide
module GHC.IO.Encoding.Iconv (
#if !defined(mingw32_HOST_OS)
- mkTextEncoding,
- latin1,
- utf8,
- utf16, utf16le, utf16be,
- utf32, utf32le, utf32be,
- localeEncoding
+ iconvEncoding, mkIconvEncoding,
+ localeEncoding, mkLocaleEncoding
#endif
) where
import Data.Maybe
import GHC.Base
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.List (span)
import GHC.Num
| c_DEBUG_DUMP = puts s
| otherwise = return ()
-puts :: String -> IO ()
-puts s = do _ <- withCAStringLen (s ++ "\n") $ \(p, len) ->
- -- In reality should be withCString, but assume ASCII to avoid loop
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
-
-- -----------------------------------------------------------------------------
-- iconv encoders/decoders
-{-# NOINLINE latin1 #-}
-latin1 :: TextEncoding
-latin1 = unsafePerformIO (mkTextEncoding "Latin1")
-
-{-# NOINLINE utf8 #-}
-utf8 :: TextEncoding
-utf8 = unsafePerformIO (mkTextEncoding "UTF8")
-
-{-# NOINLINE utf16 #-}
-utf16 :: TextEncoding
-utf16 = unsafePerformIO (mkTextEncoding "UTF16")
-
-{-# NOINLINE utf16le #-}
-utf16le :: TextEncoding
-utf16le = unsafePerformIO (mkTextEncoding "UTF16LE")
-
-{-# NOINLINE utf16be #-}
-utf16be :: TextEncoding
-utf16be = unsafePerformIO (mkTextEncoding "UTF16BE")
-
-{-# NOINLINE utf32 #-}
-utf32 :: TextEncoding
-utf32 = unsafePerformIO (mkTextEncoding "UTF32")
-
-{-# NOINLINE utf32le #-}
-utf32le :: TextEncoding
-utf32le = unsafePerformIO (mkTextEncoding "UTF32LE")
-
-{-# NOINLINE utf32be #-}
-utf32be :: TextEncoding
-utf32be = unsafePerformIO (mkTextEncoding "UTF32BE")
-
{-# NOINLINE localeEncodingName #-}
localeEncodingName :: String
localeEncodingName = unsafePerformIO $ do
cstr <- c_localeEncoding
peekCAString cstr -- Assume charset names are ASCII
-{-# NOINLINE localeEncoding #-}
localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ mkTextEncoding localeEncodingName
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName
-- We hope iconv_t is a storable type. It should be, since it has at least the
-- value -1, which is a possible return value from iconv_open.
char_shift | charSize == 2 = 1
| otherwise = 2
-mkTextEncoding :: String -> IO TextEncoding
-mkTextEncoding charset = do
+iconvEncoding :: String -> IO TextEncoding
+iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
+
+mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
+mkIconvEncoding cfm charset = do
return (TextEncoding {
textEncodingName = charset,
- mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) iconvDecode,
- mkTextEncoder = newIConv haskellChar charset iconvEncode})
+ mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
+ mkTextEncoder = newIConv haskellChar charset (recoverEncode cfm) iconvEncode})
where
-- An annoying feature of GNU iconv is that the //PREFIXES only take
-- effect when they appear on the tocode parameter to iconv_open:
(raw_charset, suffix) = span (/= '/') charset
newIConv :: String -> String
- -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+ -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+ -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
-> IO (BufferCodec a b ())
-newIConv from to fn =
+newIConv from to rec fn =
-- Assume charset names are ASCII
withCAString from $ \ from_str ->
withCAString to $ \ to_str -> do
let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
return BufferCodec{
encode = fn iconvt,
+ recover = rec,
close = iclose,
-- iconv doesn't supply a way to save/restore the state
getState = return (),
setState = const $ return ()
}
-iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
- -> IO (Buffer Word8, Buffer CharBufElem)
+iconvDecode :: IConv -> DecodeBuffer
iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
-iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8
- -> IO (Buffer CharBufElem, Buffer Word8)
+iconvEncode :: IConv -> EncodeBuffer
iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
- -> IO (Buffer a, Buffer b)
+ -> IO (CodingProgress, Buffer a, Buffer b)
iconvRecode iconv_t
input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
if (res /= -1)
then do -- all input translated
- return (new_input, new_output)
+ return (InputUnderflow, new_input, new_output)
else do
errno <- getErrno
case errno of
- e | e == eINVAL || e == e2BIG
- || e == eILSEQ && new_inleft' /= (iw-ir) -> do
- iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
- -- Output overflow is harmless
- --
- -- Similarly, we ignore EILSEQ unless we converted no
- -- characters. Sometimes iconv reports EILSEQ for a
- -- character in the input even when there is no room
- -- in the output; in this case we might be about to
- -- change the encoding anyway, so the following bytes
- -- could very well be in a different encoding.
- -- This also helps with pinpointing EILSEQ errors: we
- -- don't report it until the rest of the characters in
- -- the buffer have been drained.
- return (new_input, new_output)
-
- e -> do
- iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
- throwErrno "iconvRecoder"
- -- illegal sequence, or some other error
+ e | e == e2BIG -> return (OutputUnderflow, new_input, new_output)
+ | e == eINVAL -> return (InputUnderflow, new_input, new_output)
+ -- Sometimes iconv reports EILSEQ for a
+ -- character in the input even when there is no room
+ -- in the output; in this case we might be about to
+ -- change the encoding anyway, so the following bytes
+ -- could very well be in a different encoding.
+ --
+ -- Because we can only say InvalidSequence if there is at least
+ -- one element left in the output, we have to special case this.
+ | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
+ | otherwise -> do
+ iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
+ throwErrno "iconvRecoder"
#endif /* !mingw32_HOST_OS */
-----------------------------------------------------------------------------
module GHC.IO.Encoding.Latin1 (
- latin1,
- latin1_checked,
+ latin1, mkLatin1,
+ latin1_checked, mkLatin1_checked,
latin1_decode,
latin1_encode,
latin1_checked_encode,
import GHC.Real
import GHC.Num
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
-import Data.Maybe
-- -----------------------------------------------------------------------------
-- Latin1
latin1 :: TextEncoding
-latin1 = TextEncoding { textEncodingName = "ISO8859-1",
- mkTextDecoder = latin1_DF,
- mkTextEncoder = latin1_EF }
+latin1 = mkLatin1 ErrorOnCodingFailure
-latin1_DF :: IO (TextDecoder ())
-latin1_DF =
+mkLatin1 :: CodingFailureMode -> TextEncoding
+mkLatin1 cfm = TextEncoding { textEncodingName = "ISO8859-1",
+ mkTextDecoder = latin1_DF cfm,
+ mkTextEncoder = latin1_EF cfm }
+
+latin1_DF :: CodingFailureMode -> IO (TextDecoder ())
+latin1_DF cfm =
return (BufferCodec {
encode = latin1_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-latin1_EF :: IO (TextEncoder ())
-latin1_EF =
+latin1_EF :: CodingFailureMode -> IO (TextEncoder ())
+latin1_EF cfm =
return (BufferCodec {
encode = latin1_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
latin1_checked :: TextEncoding
-latin1_checked = TextEncoding { textEncodingName = "ISO8859-1(checked)",
- mkTextDecoder = latin1_DF,
- mkTextEncoder = latin1_checked_EF }
+latin1_checked = mkLatin1_checked ErrorOnCodingFailure
+
+mkLatin1_checked :: CodingFailureMode -> TextEncoding
+mkLatin1_checked cfm = TextEncoding { textEncodingName = "ISO8859-1(checked)",
+ mkTextDecoder = latin1_DF cfm,
+ mkTextEncoder = latin1_checked_EF cfm }
-latin1_checked_EF :: IO (TextEncoder ())
-latin1_checked_EF =
+latin1_checked_EF :: CodingFailureMode -> IO (TextEncoder ())
+latin1_checked_EF cfm =
return (BufferCodec {
encode = latin1_checked_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
loop (ir+1) ow'
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
writeWord8Buf oraw ow (fromIntegral (ord c))
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
if ord c > 0xff then invalid else do
writeWord8Buf oraw ow (fromIntegral (ord c))
loop ir' (ow+1)
where
- invalid = if ir > ir0 then done ir ow else ioe_encodingError
+ invalid = done InvalidSequence ir ow
in
loop ir0 ow0
-
-ioe_encodingError :: IO a
-ioe_encodingError = ioException
- (IOError Nothing InvalidArgument "latin1_checked_encode"
- "character is out of range for this encoding" Nothing Nothing)
TextEncoding(..),
TextEncoder, TextDecoder,
EncodeBuffer, DecodeBuffer,
+ CodingProgress(..)
) where
import GHC.Base
-- Text encoders/decoders
data BufferCodec from to state = BufferCodec {
- encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+ encode :: Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to),
-- ^ The @encode@ function translates elements of the buffer @from@
-- to the buffer @to@. It should translate as many elements as possible
-- given the sizes of the buffers, including translating zero elements
-- if there is either not enough room in @to@, or @from@ does not
-- contain a complete multibyte sequence.
- --
- -- @encode@ should raise an exception if, and only if, @from@
- -- begins with an illegal sequence, or the first element of @from@
- -- is not representable in the encoding of @to@. That is, if any
- -- elements can be successfully translated before an error is
- -- encountered, then @encode@ should translate as much as it can
- -- and not throw an exception. This behaviour is used by the IO
+ --
+ -- The fact that as many elements as possible are translated is used by the IO
-- library in order to report translation errors at the point they
-- actually occur, rather than when the buffer is translated.
--
+ -- To allow us to use iconv as a BufferCode efficiently, character buffers are
+ -- defined to contain lone surrogates instead of those private use characters that
+ -- are used for roundtripping. Thus, Chars poked and peeked from a character buffer
+ -- must undergo surrogatifyRoundtripCharacter and desurrogatifyRoundtripCharacter
+ -- respectively.
+ --
+ -- For more information on this, see Note [Roundtripping] in GHC.IO.Encoding.Failure.
+
+ recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
+ -- ^ The @recover@ function is used to continue decoding
+ -- in the presence of invalid or unrepresentable sequences. This includes
+ -- both those detected by @encode@ returning @InvalidSequence@ and those
+ -- that occur because the input byte sequence appears to be truncated.
+ --
+ -- Progress will usually be made by skipping the first element of the @from@
+ -- buffer. This function should only be called if you are certain that you
+ -- wish to do this skipping, and if the @to@ buffer has at least one element
+ -- of free space.
+ --
+ -- @recover@ may raise an exception rather than skipping anything.
+ --
+ -- Currently, some implementations of @recover@ may mutate the input buffer.
+ -- In particular, this feature is used to implement transliteration.
+
close :: IO (),
-- ^ Resources associated with the encoding may now be released.
-- The @encode@ function may not be called again after calling
-- beginning), and if not, whether to use the big or little-endian
-- encoding.
- setState :: state -> IO()
+ setState :: state -> IO ()
-- restore the state of the codec using the state from a previous
-- call to 'getState'.
}
type DecodeBuffer = Buffer Word8 -> Buffer Char
- -> IO (Buffer Word8, Buffer Char)
+ -> IO (CodingProgress, Buffer Word8, Buffer Char)
type EncodeBuffer = Buffer Char -> Buffer Word8
- -> IO (Buffer Char, Buffer Word8)
+ -> IO (CodingProgress, Buffer Char, Buffer Word8)
type TextDecoder state = BufferCodec Word8 CharBufElem state
type TextEncoder state = BufferCodec CharBufElem Word8 state
instance Show TextEncoding where
-- | Returns the value of 'textEncodingName'
show te = textEncodingName te
+
+data CodingProgress = InputUnderflow -- ^ Stopped because the input contains insufficient available elements,
+ -- or all of the input sequence has been sucessfully translated.
+ | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements
+ | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
+ -- to output at least one encoded ASCII character, but the input contains
+ -- an invalid or unrepresentable sequence
+ deriving (Eq, Show)
-----------------------------------------------------------------------------
module GHC.IO.Encoding.UTF16 (
- utf16,
+ utf16, mkUTF16,
utf16_decode,
utf16_encode,
- utf16be,
+ utf16be, mkUTF16be,
utf16be_decode,
utf16be_encode,
- utf16le,
+ utf16le, mkUTF16le,
utf16le_decode,
utf16le_encode,
) where
import GHC.Real
import GHC.Num
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
import Data.Maybe
import GHC.IORef
-#if DEBUG
-import System.Posix.Internals
-import Foreign.C
-import GHC.Show
-import GHC.Ptr
-
-puts :: String -> IO ()
- -- In reality should be withCString, but assume ASCII to avoid possible loop
-puts s = do withCAStringLen (s++"\n") $ \(p,len) ->
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
-#endif
-
-- -----------------------------------------------------------------------------
-- The UTF-16 codec: either UTF16BE or UTF16LE with a BOM
utf16 :: TextEncoding
-utf16 = TextEncoding { textEncodingName = "UTF-16",
- mkTextDecoder = utf16_DF,
- mkTextEncoder = utf16_EF }
+utf16 = mkUTF16 ErrorOnCodingFailure
+
+mkUTF16 :: CodingFailureMode -> TextEncoding
+mkUTF16 cfm = TextEncoding { textEncodingName = "UTF-16",
+ mkTextDecoder = utf16_DF cfm,
+ mkTextEncoder = utf16_EF cfm }
-utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
-utf16_DF = do
+utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
+utf16_DF cfm = do
seen_bom <- newIORef Nothing
return (BufferCodec {
encode = utf16_decode seen_bom,
+ recover = recoverDecode cfm,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
-utf16_EF :: IO (TextEncoder Bool)
-utf16_EF = do
+utf16_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf16_EF cfm = do
done_bom <- newIORef False
return (BufferCodec {
encode = utf16_encode done_bom,
+ recover = recoverEncode cfm,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
b <- readIORef done_bom
if b then utf16_native_encode input output
else if os - ow < 2
- then return (input,output)
+ then return (OutputUnderflow,input,output)
else do
writeIORef done_bom True
writeWord8Buf oraw ow bom1
case mb of
Just decode -> decode input output
Nothing ->
- if iw - ir < 2 then return (input,output) else do
+ if iw - ir < 2 then return (InputUnderflow,input,output) else do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
case () of
-- UTF16LE and UTF16BE
utf16be :: TextEncoding
-utf16be = TextEncoding { textEncodingName = "UTF-16BE",
- mkTextDecoder = utf16be_DF,
- mkTextEncoder = utf16be_EF }
+utf16be = mkUTF16be ErrorOnCodingFailure
-utf16be_DF :: IO (TextDecoder ())
-utf16be_DF =
+mkUTF16be :: CodingFailureMode -> TextEncoding
+mkUTF16be cfm = TextEncoding { textEncodingName = "UTF-16BE",
+ mkTextDecoder = utf16be_DF cfm,
+ mkTextEncoder = utf16be_EF cfm }
+
+utf16be_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf16be_DF cfm =
return (BufferCodec {
encode = utf16be_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf16be_EF :: IO (TextEncoder ())
-utf16be_EF =
+utf16be_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf16be_EF cfm =
return (BufferCodec {
encode = utf16be_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf16le :: TextEncoding
-utf16le = TextEncoding { textEncodingName = "UTF16-LE",
- mkTextDecoder = utf16le_DF,
- mkTextEncoder = utf16le_EF }
+utf16le = mkUTF16le ErrorOnCodingFailure
+
+mkUTF16le :: CodingFailureMode -> TextEncoding
+mkUTF16le cfm = TextEncoding { textEncodingName = "UTF16-LE",
+ mkTextDecoder = utf16le_DF cfm,
+ mkTextEncoder = utf16le_EF cfm }
-utf16le_DF :: IO (TextDecoder ())
-utf16le_DF =
+utf16le_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf16le_DF cfm =
return (BufferCodec {
encode = utf16le_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf16le_EF :: IO (TextEncoder ())
-utf16le_EF =
+utf16le_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf16le_EF cfm =
return (BufferCodec {
encode = utf16le_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
- | ir + 1 == iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | ir + 1 == iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
if validate1 x1
then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
loop (ir+2) ow'
- else if iw - ir < 4 then done ir ow else do
+ else if iw - ir < 4 then done InputUnderflow ir ow else do
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3
ow' <- writeCharBuf oraw ow (chr2 x1 x2)
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
- | ir + 1 == iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | ir + 1 == iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
if validate1 x1
then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1))
loop (ir+2) ow'
- else if iw - ir < 4 then done ir ow else do
+ else if iw - ir < 4 then done InputUnderflow ir ow else do
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2
ow' <- writeCharBuf oraw ow (chr2 x1 x2)
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "utf16_decode"
- "invalid UTF-16 byte sequence" Nothing Nothing)
-
utf16be_encode :: EncodeBuffer
utf16be_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 2 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 2 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
- x | x < 0x10000 -> do
+ x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))
writeWord8Buf oraw (ow+1) (fromIntegral x)
loop ir' (ow+2)
| otherwise -> do
- if os - ow < 4 then done ir ow else do
+ if os - ow < 4 then done OutputUnderflow ir ow else do
let
n1 = x - 0x10000
c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 2 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 2 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
- x | x < 0x10000 -> do
+ x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do
writeWord8Buf oraw ow (fromIntegral x)
writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))
loop ir' (ow+2)
| otherwise ->
- if os - ow < 4 then done ir ow else do
+ if os - ow < 4 then done OutputUnderflow ir ow else do
let
n1 = x - 0x10000
c1 = fromIntegral (n1 `shiftR` 18 + 0xD8)
-----------------------------------------------------------------------------
module GHC.IO.Encoding.UTF32 (
- utf32,
+ utf32, mkUTF32,
utf32_decode,
utf32_encode,
- utf32be,
+ utf32be, mkUTF32be,
utf32be_decode,
utf32be_encode,
- utf32le,
+ utf32le, mkUTF32le,
utf32le_decode,
utf32le_encode,
) where
import GHC.Real
import GHC.Num
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
-- The UTF-32 codec: either UTF-32BE or UTF-32LE with a BOM
utf32 :: TextEncoding
-utf32 = TextEncoding { textEncodingName = "UTF-32",
- mkTextDecoder = utf32_DF,
- mkTextEncoder = utf32_EF }
+utf32 = mkUTF32 ErrorOnCodingFailure
-utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
-utf32_DF = do
+mkUTF32 :: CodingFailureMode -> TextEncoding
+mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32",
+ mkTextDecoder = utf32_DF cfm,
+ mkTextEncoder = utf32_EF cfm }
+
+utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
+utf32_DF cfm = do
seen_bom <- newIORef Nothing
return (BufferCodec {
encode = utf32_decode seen_bom,
+ recover = recoverDecode cfm,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
-utf32_EF :: IO (TextEncoder Bool)
-utf32_EF = do
+utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf32_EF cfm = do
done_bom <- newIORef False
return (BufferCodec {
encode = utf32_encode done_bom,
+ recover = recoverEncode cfm,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
b <- readIORef done_bom
if b then utf32_native_encode input output
else if os - ow < 4
- then return (input,output)
+ then return (OutputUnderflow, input,output)
else do
writeIORef done_bom True
writeWord8Buf oraw ow bom0
case mb of
Just decode -> decode input output
Nothing ->
- if iw - ir < 4 then return (input,output) else do
+ if iw - ir < 4 then return (InputUnderflow, input,output) else do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
-- UTF32LE and UTF32BE
utf32be :: TextEncoding
-utf32be = TextEncoding { textEncodingName = "UTF-32BE",
- mkTextDecoder = utf32be_DF,
- mkTextEncoder = utf32be_EF }
+utf32be = mkUTF32be ErrorOnCodingFailure
+
+mkUTF32be :: CodingFailureMode -> TextEncoding
+mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE",
+ mkTextDecoder = utf32be_DF cfm,
+ mkTextEncoder = utf32be_EF cfm }
-utf32be_DF :: IO (TextDecoder ())
-utf32be_DF =
+utf32be_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf32be_DF cfm =
return (BufferCodec {
encode = utf32be_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf32be_EF :: IO (TextEncoder ())
-utf32be_EF =
+utf32be_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf32be_EF cfm =
return (BufferCodec {
encode = utf32be_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
utf32le :: TextEncoding
-utf32le = TextEncoding { textEncodingName = "UTF-32LE",
- mkTextDecoder = utf32le_DF,
- mkTextEncoder = utf32le_EF }
+utf32le = mkUTF32le ErrorOnCodingFailure
-utf32le_DF :: IO (TextDecoder ())
-utf32le_DF =
+mkUTF32le :: CodingFailureMode -> TextEncoding
+mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE",
+ mkTextDecoder = utf32le_DF cfm,
+ mkTextEncoder = utf32le_EF cfm }
+
+utf32le_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf32le_DF cfm =
return (BufferCodec {
encode = utf32le_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf32le_EF :: IO (TextEncoder ())
-utf32le_EF =
+utf32le_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf32le_EF cfm =
return (BufferCodec {
encode = utf32le_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || iw - ir < 4 = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | iw - ir < 4 = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
ow' <- writeCharBuf oraw ow x1
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || iw - ir < 4 = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | iw - ir < 4 = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
ow' <- writeCharBuf oraw ow x1
loop (ir+4) ow'
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "utf32_decode"
- "invalid UTF-32 byte sequence" Nothing Nothing)
-
utf32be_encode :: EncodeBuffer
utf32be_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 4 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 4 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
- let (c0,c1,c2,c3) = ord4 c
- writeWord8Buf oraw ow c0
- writeWord8Buf oraw (ow+1) c1
- writeWord8Buf oraw (ow+2) c2
- writeWord8Buf oraw (ow+3) c3
- loop ir' (ow+4)
+ if isSurrogate c then done InvalidSequence ir ow else do
+ let (c0,c1,c2,c3) = ord4 c
+ writeWord8Buf oraw ow c0
+ writeWord8Buf oraw (ow+1) c1
+ writeWord8Buf oraw (ow+2) c2
+ writeWord8Buf oraw (ow+3) c3
+ loop ir' (ow+4)
in
loop ir0 ow0
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ir >= iw = done ir ow
- | os - ow < 4 = done ir ow
+ | ir >= iw = done InputUnderflow ir ow
+ | os - ow < 4 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
- let (c0,c1,c2,c3) = ord4 c
- writeWord8Buf oraw ow c3
- writeWord8Buf oraw (ow+1) c2
- writeWord8Buf oraw (ow+2) c1
- writeWord8Buf oraw (ow+3) c0
- loop ir' (ow+4)
+ if isSurrogate c then done InvalidSequence ir ow else do
+ let (c0,c1,c2,c3) = ord4 c
+ writeWord8Buf oraw ow c3
+ writeWord8Buf oraw (ow+1) c2
+ writeWord8Buf oraw (ow+2) c1
+ writeWord8Buf oraw (ow+3) c0
+ loop ir' (ow+4)
in
loop ir0 ow0
-----------------------------------------------------------------------------
module GHC.IO.Encoding.UTF8 (
- utf8,
- utf8_bom,
+ utf8, mkUTF8,
+ utf8_bom, mkUTF8_bom
) where
import GHC.Base
import GHC.Num
import GHC.IORef
-- import GHC.IO
-import GHC.IO.Exception
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
-import Data.Maybe
utf8 :: TextEncoding
-utf8 = TextEncoding { textEncodingName = "UTF-8",
- mkTextDecoder = utf8_DF,
- mkTextEncoder = utf8_EF }
+utf8 = mkUTF8 ErrorOnCodingFailure
-utf8_DF :: IO (TextDecoder ())
-utf8_DF =
+mkUTF8 :: CodingFailureMode -> TextEncoding
+mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8",
+ mkTextDecoder = utf8_DF cfm,
+ mkTextEncoder = utf8_EF cfm }
+
+
+utf8_DF :: CodingFailureMode -> IO (TextDecoder ())
+utf8_DF cfm =
return (BufferCodec {
encode = utf8_decode,
+ recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-utf8_EF :: IO (TextEncoder ())
-utf8_EF =
+utf8_EF :: CodingFailureMode -> IO (TextEncoder ())
+utf8_EF cfm =
return (BufferCodec {
encode = utf8_encode,
+ recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf8_bom :: TextEncoding
-utf8_bom = TextEncoding { textEncodingName = "UTF-8BOM",
- mkTextDecoder = utf8_bom_DF,
- mkTextEncoder = utf8_bom_EF }
+utf8_bom = mkUTF8_bom ErrorOnCodingFailure
-utf8_bom_DF :: IO (TextDecoder Bool)
-utf8_bom_DF = do
+mkUTF8_bom :: CodingFailureMode -> TextEncoding
+mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM",
+ mkTextDecoder = utf8_bom_DF cfm,
+ mkTextEncoder = utf8_bom_EF cfm }
+
+utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool)
+utf8_bom_DF cfm = do
ref <- newIORef True
return (BufferCodec {
encode = utf8_bom_decode ref,
+ recover = recoverDecode cfm,
close = return (),
getState = readIORef ref,
setState = writeIORef ref
})
-utf8_bom_EF :: IO (TextEncoder Bool)
-utf8_bom_EF = do
+utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool)
+utf8_bom_EF cfm = do
ref <- newIORef True
return (BufferCodec {
encode = utf8_bom_encode ref,
+ recover = recoverEncode cfm,
close = return (),
getState = readIORef ref,
setState = writeIORef ref
then utf8_decode input output
else do
let no_bom = do writeIORef ref False; utf8_decode input output
- if iw - ir < 1 then return (input,output) else do
+ if iw - ir < 1 then return (InputUnderflow,input,output) else do
c0 <- readWord8Buf iraw ir
if (c0 /= bom0) then no_bom else do
- if iw - ir < 2 then return (input,output) else do
+ if iw - ir < 2 then return (InputUnderflow,input,output) else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 /= bom1) then no_bom else do
- if iw - ir < 3 then return (input,output) else do
+ if iw - ir < 3 then return (InputUnderflow,input,output) else do
c2 <- readWord8Buf iraw (ir+2)
if (c2 /= bom2) then no_bom else do
-- found a BOM, ignore it and carry on
b <- readIORef ref
if not b then utf8_encode input output
else if os - ow < 3
- then return (input,output)
+ then return (OutputUnderflow,input,output)
else do
writeIORef ref False
writeWord8Buf oraw ow bom0
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
case c0 of
ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
loop (ir+1) ow'
| c0 >= 0xc0 && c0 <= 0xdf ->
- if iw - ir < 2 then done ir ow else do
+ if iw - ir < 2 then done InputUnderflow ir ow else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
ow' <- writeCharBuf oraw ow (chr2 c0 c1)
loop (ir+2) ow'
| c0 >= 0xe0 && c0 <= 0xef ->
case iw - ir of
- 1 -> done ir ow
+ 1 -> done InputUnderflow ir ow
2 -> do -- check for an error even when we don't have
-- the full sequence yet (#3341)
c1 <- readWord8Buf iraw (ir+1)
if not (validate3 c0 c1 0x80)
- then invalid else done ir ow
+ then invalid else done InputUnderflow ir ow
_ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
loop (ir+3) ow'
| c0 >= 0xf0 ->
case iw - ir of
- 1 -> done ir ow
+ 1 -> done InputUnderflow ir ow
2 -> do -- check for an error even when we don't have
-- the full sequence yet (#3341)
c1 <- readWord8Buf iraw (ir+1)
if not (validate4 c0 c1 0x80 0x80)
- then invalid else done ir ow
+ then invalid else done InputUnderflow ir ow
3 -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
if not (validate4 c0 c1 c2 0x80)
- then invalid else done ir ow
+ then invalid else done InputUnderflow ir ow
_ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
| otherwise ->
invalid
where
- invalid = if ir > ir0 then done ir ow else ioe_decodingError
+ invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
in
loop ir0 ow0
-ioe_decodingError :: IO a
-ioe_decodingError = ioException
- (IOError Nothing InvalidArgument "utf8_decode"
- "invalid UTF-8 byte sequence" Nothing Nothing)
-
utf8_encode :: EncodeBuffer
utf8_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
- done !ir !ow = return (if ir == iw then input{ bufL=0, bufR=0 }
- else input{ bufL=ir },
- output{ bufR=ow })
+ done why !ir !ow = return (why,
+ if ir == iw then input{ bufL=0, bufR=0 }
+ else input{ bufL=ir },
+ output{ bufR=ow })
loop !ir !ow
- | ow >= os || ir >= iw = done ir ow
+ | ow >= os = done OutputUnderflow ir ow
+ | ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
writeWord8Buf oraw ow (fromIntegral x)
loop ir' (ow+1)
| x <= 0x07FF ->
- if os - ow < 2 then done ir ow else do
+ if os - ow < 2 then done OutputUnderflow ir ow else do
let (c1,c2) = ord2 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
loop ir' (ow+2)
- | x <= 0xFFFF -> do
- if os - ow < 3 then done ir ow else do
+ | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do
+ if os - ow < 3 then done OutputUnderflow ir ow else do
let (c1,c2,c3) = ord3 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
writeWord8Buf oraw (ow+2) c3
loop ir' (ow+3)
| otherwise -> do
- if os - ow < 4 then done ir ow else do
+ if os - ow < 4 then done OutputUnderflow ir ow else do
let (c1,c2,c3,c4) = ord4 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
import GHC.IO.Device (SeekMode(..), IODeviceType(..))
import GHC.Conc.IO
import GHC.IO.Exception
+#ifdef mingw32_HOST_OS
+import GHC.Windows
+#endif
import Foreign
import Foreign.C
-- for this case. We need to detect EPIPE correctly, because it
-- shouldn't be reported as an error when it happens on stdout.
-foreign import ccall unsafe "maperrno" -- in Win32Utils.c
- c_maperrno :: IO ()
-
-- NOTE: "safe" versions of the read/write calls for use by the threaded RTS.
-- These calls may block, but that's ok.
foreign import ccall unsafe "unlockFile"
unlockFile :: CInt -> IO CInt
#endif
-
-puts :: String -> IO ()
-puts s = do _ <- withCStringLen s $ \(p,len) ->
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
("illegal buffer size " ++ showsPrec 9 n []) Nothing Nothing)
-- 9 => should be parens'ified.
+-- ---------------------------------------------------------------------------
+-- Wrapper for Handle encoding/decoding.
+
+-- The interface for TextEncoding changed so that a TextEncoding doesn't raise
+-- an exception if it encounters an invalid sequnce. Furthermore, encoding
+-- returns a reason as to why encoding stopped, letting us know if it was due
+-- to input/output underflow or an invalid sequence.
+--
+-- This code adapts this elaborated interface back to the original TextEncoding
+-- interface.
+--
+-- FIXME: it is possible that Handle code using the haDecoder/haEncoder fields
+-- could be made clearer by using the 'encode' interface directly. I have not
+-- looked into this.
+--
+-- FIXME: we should use recover to deal with EOF, rather than always throwing an
+-- IOException (ioe_invalidCharacter).
+
+streamEncode :: BufferCodec from to state
+ -> Buffer from -> Buffer to
+ -> IO (Buffer from, Buffer to)
+streamEncode codec from to = go (from, to)
+ where
+ go (from, to) = do
+ (why, from', to') <- encode codec from to
+ -- When we are dealing with Handles, we don't care about input/output
+ -- underflow particularly, and we want to delay errors about invalid
+ -- sequences as far as possible.
+ case why of
+ Encoding.InvalidSequence | bufL from == bufL from' -> recover codec from' to' >>= go
+ _ -> return (from', to')
+
-- -----------------------------------------------------------------------------
-- Handle Finalizers
(cbuf',bbuf') <- case haEncoder of
Nothing -> latin1_encode cbuf bbuf
- Just encoder -> (encode encoder) cbuf bbuf
+ Just encoder -> (streamEncode encoder) cbuf bbuf
debugIO ("writeCharBuffer after encoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf')
-- restore the codec state
setState decoder codec_state
- (bbuf1,cbuf1) <- (encode decoder) bbuf0
+ (bbuf1,cbuf1) <- (streamEncode decoder) bbuf0
cbuf0{ bufL=0, bufR=0, bufSize = bufL cbuf0 }
debugIO ("finished, bbuf=" ++ summaryBuffer bbuf1 ++
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf1)
- (encode decoder) bbuf1 cbuf
+ (streamEncode decoder) bbuf1 cbuf
debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf2)
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf2)
- (encode decoder) bbuf2 cbuf
+ (streamEncode decoder) bbuf2 cbuf
debugIO ("readTextDevice' after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf3)
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf0)
- (encode decoder) bbuf0 cbuf
+ (streamEncode decoder) bbuf0 cbuf
writeIORef haByteBuffer bbuf2
return cbuf'
import GHC.IO.Buffer
import qualified GHC.IO.BufferedIO as Buffered
import GHC.IO.Exception
+import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
import GHC.Exception
import GHC.IO.Handle.Types
import GHC.IO.Handle.Internals
else do c1 <- peekElemOff pbuf (i-1)
let c = (fromIntegral c1 - 0xd800) * 0x400 +
(fromIntegral c2 - 0xdc00) + 0x10000
- unpackRB (unsafeChr c : acc) (i-2)
+ unpackRB (desurrogatifyRoundtripCharacter (unsafeChr c) : acc) (i-2)
#else
c <- peekElemOff pbuf i
- unpackRB (c:acc) (i-1)
+ unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
#endif
in
unpackRB acc0 (w-1)
then unpackRB ('\n':acc) (i-2)
else unpackRB ('\n':acc) (i-1)
else do
- unpackRB (c:acc) (i-1)
+ unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
in do
c <- peekElemOff pbuf (w-1)
if (c == '\r')
else do
shoveString n' cs rest
| otherwise = do
- n' <- writeCharBuf raw n c
+ n' <- writeCharBuf raw n (surrogatifyRoundtripCharacter c)
shoveString n' cs rest
in
shoveString 0 s (if add_nl then "\n" else "")
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude, ForeignFunctionInterface #-}
+-----------------------------------------------------------------------------
+-- |
+-- Module : GHC.Windows
+-- Copyright : (c) The University of Glasgow, 2009
+-- License : see libraries/base/LICENSE
+--
+-- Maintainer : libraries@haskell.org
+-- Stability : internal
+-- Portability : non-portable
+--
+-- Windows functionality used by several modules.
+--
+-- ToDo: this just duplicates part of System.Win32.Types, which isn't
+-- available yet. We should move some Win32 functionality down here,
+-- maybe as part of the grand reorganisation of the base package...
+--
+-----------------------------------------------------------------------------
+
+module GHC.Windows where
+
+import GHC.Base
+import GHC.Ptr
+
+import Data.Word
+
+import Foreign.C.Error (throwErrno)
+import Foreign.C.Types
+
+
+type HANDLE = Ptr ()
+type DWORD = Word32
+
+type LPTSTR = Ptr CWchar
+
+iNFINITE :: DWORD
+iNFINITE = 0xFFFFFFFF -- urgh
+
+throwGetLastError :: String -> IO a
+throwGetLastError where_from = c_maperrno >> throwErrno where_from
+
+foreign import ccall unsafe "maperrno" -- in Win32Utils.c
+ c_maperrno :: IO ()
+
import Prelude
#ifdef __GLASGOW_HASKELL__
-import Data.List
import Foreign
import Foreign.C
import Control.Exception.Base ( bracket )
-import Control.Monad
-- import GHC.IO
import GHC.IO.Exception
+import GHC.IO.Encoding (fileSystemEncoding)
+import qualified GHC.Foreign as GHC
+import Data.List
+#ifdef mingw32_HOST_OS
+import GHC.Environment
+import GHC.Windows
+#else
+import Control.Monad
+#endif
#endif
#ifdef __HUGS__
)
#endif
+#ifdef __GLASGOW_HASKELL__
-- ---------------------------------------------------------------------------
-- getArgs, getProgName, getEnv
+#ifdef mingw32_HOST_OS
+
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+
+getWin32ProgArgv_certainly :: IO [String]
+getWin32ProgArgv_certainly = do
+ mb_argv <- getWin32ProgArgv
+ case mb_argv of
+ Nothing -> fmap dropRTSArgs getFullArgs
+ Just argv -> return argv
+
+withWin32ProgArgv :: [String] -> IO a -> IO a
+withWin32ProgArgv argv act = bracket begin setWin32ProgArgv (\_ -> act)
+ where
+ begin = do
+ mb_old_argv <- getWin32ProgArgv
+ setWin32ProgArgv (Just argv)
+ return mb_old_argv
+
+getWin32ProgArgv :: IO (Maybe [String])
+getWin32ProgArgv = alloca $ \p_argc -> alloca $ \p_argv -> do
+ c_getWin32ProgArgv p_argc p_argv
+ argc <- peek p_argc
+ argv_p <- peek p_argv
+ if argv_p == nullPtr
+ then return Nothing
+ else do
+ argv_ps <- peekArray (fromIntegral argc) argv_p
+ fmap Just $ mapM peekCWString argv_ps
+
+setWin32ProgArgv :: Maybe [String] -> IO ()
+setWin32ProgArgv Nothing = c_setWin32ProgArgv 0 nullPtr
+setWin32ProgArgv (Just argv) = withMany withCWString argv $ \argv_ps -> withArrayLen argv_ps $ \argc argv_p -> do
+ c_setWin32ProgArgv (fromIntegral argc) argv_p
+
+foreign import ccall unsafe "getWin32ProgArgv"
+ c_getWin32ProgArgv :: Ptr CInt -> Ptr (Ptr CWString) -> IO ()
+
+foreign import ccall unsafe "setWin32ProgArgv"
+ c_setWin32ProgArgv :: CInt -> Ptr CWString -> IO ()
+
+dropRTSArgs :: [String] -> [String]
+dropRTSArgs [] = []
+dropRTSArgs ("+RTS":rest) = dropRTSArgs (dropWhile (/= "-RTS") rest)
+dropRTSArgs ("--RTS":rest) = rest
+dropRTSArgs ("-RTS":rest) = dropRTSArgs rest
+dropRTSArgs (arg:rest) = arg : dropRTSArgs rest
+
+#endif
+
-- | Computation 'getArgs' returns a list of the program's command
-- line arguments (not including the program name).
-
-#ifdef __GLASGOW_HASKELL__
getArgs :: IO [String]
+
+#ifdef mingw32_HOST_OS
+getArgs = fmap tail getWin32ProgArgv_certainly
+#else
getArgs =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
getProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
- peekArray (p - 1) (advancePtr argv 1) >>= mapM peekCString
-
+ peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString fileSystemEncoding)
foreign import ccall unsafe "getProgArgv"
getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
+#endif
{-|
Computation 'getProgName' returns the name of the program as it was
is probably really @FOO.EXE@, and that is what 'getProgName' will return.
-}
getProgName :: IO String
+#ifdef mingw32_HOST_OS
+-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
+getProgName = fmap (basename . head) getWin32ProgArgv_certainly
+#else
getProgName =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
unpackProgName :: Ptr (Ptr CChar) -> IO String -- argv[0]
unpackProgName argv = do
- s <- peekElemOff argv 0 >>= peekCString
+ s <- peekElemOff argv 0 >>= GHC.peekCString fileSystemEncoding
return (basename s)
- where
- basename :: String -> String
- basename f = go f f
- where
- go acc [] = acc
- go acc (x:xs)
- | isPathSeparator x = go xs xs
- | otherwise = go acc xs
-
- isPathSeparator :: Char -> Bool
- isPathSeparator '/' = True
-#ifdef mingw32_HOST_OS
- isPathSeparator '\\' = True
#endif
- isPathSeparator _ = False
+
+basename :: FilePath -> FilePath
+basename f = go f f
+ where
+ go acc [] = acc
+ go acc (x:xs)
+ | isPathSeparator x = go xs xs
+ | otherwise = go acc xs
+
+ isPathSeparator :: Char -> Bool
+ isPathSeparator '/' = True
+#ifdef mingw32_HOST_OS
+ isPathSeparator '\\' = True
+#endif
+ isPathSeparator _ = False
-- | Computation 'getEnv' @var@ returns the value
-- does not exist.
getEnv :: String -> IO String
+#ifdef mingw32_HOST_OS
+getEnv name = withCWString name $ \s -> try_size s 256
+ where
+ try_size s size = allocaArray (fromIntegral size) $ \p_value -> do
+ res <- c_GetEnvironmentVariable s p_value size
+ case res of
+ 0 -> do
+ err <- c_GetLastError
+ if err == eRROR_ENVVAR_NOT_FOUND
+ then ioe_missingEnvVar name
+ else throwGetLastError "getEnv"
+ _ | res > size -> try_size s res -- Rare: size increased between calls to GetEnvironmentVariable
+ | otherwise -> peekCWString p_value
+
+eRROR_ENVVAR_NOT_FOUND :: DWORD
+eRROR_ENVVAR_NOT_FOUND = 203
+
+foreign import stdcall unsafe "windows.h GetLastError"
+ c_GetLastError:: IO DWORD
+
+foreign import stdcall unsafe "windows.h GetEnvironmentVariableW"
+ c_GetEnvironmentVariable :: LPTSTR -> LPTSTR -> DWORD -> IO DWORD
+#else
getEnv name =
withCString name $ \s -> do
litstring <- c_getenv s
if litstring /= nullPtr
- then peekCString litstring
- else ioException (IOError Nothing NoSuchThing "getEnv"
- "no environment variable" Nothing (Just name))
+ then GHC.peekCString fileSystemEncoding litstring
+ else ioe_missingEnvVar name
foreign import ccall unsafe "getenv"
c_getenv :: CString -> IO (Ptr CChar)
+#endif
+
+ioe_missingEnvVar :: String -> IO a
+ioe_missingEnvVar name = ioException (IOError Nothing NoSuchThing "getEnv"
+ "no environment variable" Nothing (Just name))
{-|
'withArgs' @args act@ - while executing action @act@, have 'getArgs'
-- the duration of an action.
withArgv :: [String] -> IO a -> IO a
-withArgv new_args act = do
+
+#ifdef mingw32_HOST_OS
+-- We have to reflect the updated arguments in the RTS-side variables as
+-- well, because the RTS still consults them for error messages and the like.
+-- If we don't do this then ghc-e005 fails.
+withArgv new_args act = withWin32ProgArgv new_args $ withProgArgv new_args act
+#else
+withArgv = withProgArgv
+#endif
+
+withProgArgv :: [String] -> IO a -> IO a
+withProgArgv new_args act = do
pName <- System.Environment.getProgName
existing_args <- System.Environment.getArgs
- bracket (setArgs new_args)
- (\argv -> do _ <- setArgs (pName:existing_args)
- freeArgv argv)
+ bracket (setProgArgv new_args)
+ (\argv -> do _ <- setProgArgv (pName:existing_args)
+ freeProgArgv argv)
(const act)
-freeArgv :: Ptr CString -> IO ()
-freeArgv argv = do
+freeProgArgv :: Ptr CString -> IO ()
+freeProgArgv argv = do
size <- lengthArray0 nullPtr argv
sequence_ [peek (argv `advancePtr` i) >>= free | i <- [size, size-1 .. 0]]
free argv
-setArgs :: [String] -> IO (Ptr CString)
-setArgs argv = do
- vs <- mapM newCString argv >>= newArray0 nullPtr
- setArgsPrim (genericLength argv) vs
+setProgArgv :: [String] -> IO (Ptr CString)
+setProgArgv argv = do
+ vs <- mapM (GHC.newCString fileSystemEncoding) argv >>= newArray0 nullPtr
+ c_setProgArgv (genericLength argv) vs
return vs
foreign import ccall unsafe "setProgArgv"
- setArgsPrim :: CInt -> Ptr CString -> IO ()
+ c_setProgArgv :: CInt -> Ptr CString -> IO ()
-- |'getEnvironment' retrieves the entire environment as a
-- list of @(key,value)@ pairs.
--
-- If an environment entry does not contain an @\'=\'@ character,
-- the @key@ is the whole entry and the @value@ is the empty string.
-
getEnvironment :: IO [(String, String)]
+
+#ifdef mingw32_HOST_OS
+getEnvironment = bracket c_GetEnvironmentStrings c_FreeEnvironmentStrings $ \pBlock ->
+ if pBlock == nullPtr then return []
+ else go pBlock
+ where
+ go pBlock = do
+ -- The block is terminated by a null byte where there
+ -- should be an environment variable of the form X=Y
+ c <- peek pBlock
+ if c == 0 then return []
+ else do
+ -- Seek the next pair (or terminating null):
+ pBlock' <- seekNull pBlock False
+ -- We now know the length in bytes, but ignore it when
+ -- getting the actual String:
+ str <- peekCWString pBlock
+ fmap (divvy str :) $ go pBlock'
+
+ -- Returns pointer to the byte *after* the next null
+ seekNull pBlock done = do
+ let pBlock' = pBlock `plusPtr` sizeOf (undefined :: CWchar)
+ if done then return pBlock'
+ else do
+ c <- peek pBlock'
+ seekNull pBlock' (c == (0 :: Word8 ))
+
+foreign import stdcall unsafe "windows.h GetEnvironmentStringsW"
+ c_GetEnvironmentStrings :: IO (Ptr CWchar)
+
+foreign import stdcall unsafe "windows.h FreeEnvironmentStringsW"
+ c_FreeEnvironmentStrings :: Ptr CWchar -> IO Bool
+#else
getEnvironment = do
pBlock <- getEnvBlock
if pBlock == nullPtr then return []
else do
- stuff <- peekArray0 nullPtr pBlock >>= mapM peekCString
+ stuff <- peekArray0 nullPtr pBlock >>= mapM (GHC.peekCString fileSystemEncoding)
return (map divvy stuff)
- where
- divvy str =
- case break (=='=') str of
- (xs,[]) -> (xs,[]) -- don't barf (like Posix.getEnvironment)
- (name,_:value) -> (name,value)
foreign import ccall unsafe "__hscore_environ"
getEnvBlock :: IO (Ptr CString)
+#endif
+
+divvy :: String -> (String, String)
+divvy str =
+ case break (=='=') str of
+ (xs,[]) -> (xs,[]) -- don't barf (like Posix.getEnvironment)
+ (name,_:value) -> (name,value)
#endif /* __GLASGOW_HASKELL__ */
#ifdef __GLASGOW_HASKELL__
import GHC.Base
-import GHC.IO hiding ( onException )
+import GHC.IO hiding ( bracket, onException )
import GHC.IO.IOMode
import GHC.IO.Handle.FD
import qualified GHC.IO.FD as FD
import GHC.IO.IOMode
import GHC.IO.Exception
import GHC.IO.Device
+#ifndef mingw32_HOST_OS
+import {-# SOURCE #-} GHC.IO.Encoding (fileSystemEncoding)
+import qualified GHC.Foreign as GHC
+#endif
#elif __HUGS__
import Hugs.Prelude (IOException(..), IOErrorType(..))
import Hugs.IO (IOMode(..))
{-# CFILES cbits/PrelIOUtils.c cbits/consUtils.c #-}
#endif
+
+-- ---------------------------------------------------------------------------
+-- Debugging the base package
+
+puts :: String -> IO ()
+puts s = withCAStringLen (s ++ "\n") $ \(p, len) -> do
+ -- In reality should be withCString, but assume ASCII to avoid loop
+ -- if this is called by GHC.Foreign
+ _ <- c_write 1 (castPtr p) (fromIntegral len)
+ return ()
+
+
-- ---------------------------------------------------------------------------
-- Types
#ifdef mingw32_HOST_OS
withFilePath :: FilePath -> (CWString -> IO a) -> IO a
-withFilePath = withCWString
+withFilePath = withCWString
+
+peekFilePath :: CWString -> IO FilePath
+peekFilePath = peekCWString
#else
+
withFilePath :: FilePath -> (CString -> IO a) -> IO a
+peekFilePath :: CString -> IO FilePath
+peekFilePathLen :: CStringLen -> IO FilePath
+
+#if __GLASGOW_HASKELL__
+withFilePath = GHC.withCString fileSystemEncoding
+peekFilePath = GHC.peekCString fileSystemEncoding
+peekFilePathLen = GHC.peekCStringLen fileSystemEncoding
+#else
withFilePath = withCString
+peekFilePath = peekCString
+peekFilePathLen = peekCStringLen
+#endif
+
#endif
-- ---------------------------------------------------------------------------
--- /dev/null
+{-# LANGUAGE NoImplicitPrelude #-}
+module System.Posix.Internals where
+
+import GHC.IO
+import GHC.Base
+
+puts :: String -> IO ()
GHC.Exts,
GHC.Float,
GHC.Float.RealFracMethods,
+ GHC.Foreign,
GHC.Float.ConversionUtils,
GHC.ForeignPtr,
GHC.MVar,
GHC.IO.Encoding.Types,
GHC.IO.Encoding.Iconv,
GHC.IO.Encoding.CodePage,
+ GHC.IO.Encoding.Failure,
GHC.IO.Handle,
GHC.IO.Handle.Types,
GHC.IO.Handle.Internals,
if os(windows)
exposed-modules: GHC.IO.Encoding.CodePage.Table
GHC.Conc.Windows
+ GHC.Windows
}
exposed-modules:
Control.Applicative,