Big patch to improve Unicode support in GHC. Validated on OS X and Windows, this

[ghc-base.git] / GHC / Foreign.hs
diff --git a/GHC/Foreign.hs b/GHC/Foreign.hs

new file mode 100644 (file)

index 0000000..b4c760c
--- /dev/null
+++ b/GHC/Foreign.hs
@@ -0,0 +1,255 @@
+{-# LANGUAGE CPP, NoImplicitPrelude #-}
+
+-----------------------------------------------------------------------------
+-- |
+-- Module      :  GHC.IO.Encoding
+-- Copyright   :  (c) The University of Glasgow, 2008-2011
+-- License     :  see libraries/base/LICENSE
+-- 
+-- Maintainer  :  libraries@haskell.org
+-- Stability   :  internal
+-- Portability :  non-portable
+--
+-- Foreign marshalling support for CStrings with configurable encodings
+--
+-----------------------------------------------------------------------------
+
+module GHC.Foreign (
+    -- * C strings with a configurable encoding
+    
+    -- conversion of C strings into Haskell strings
+    --
+    peekCString,       -- :: TextEncoding -> CString    -> IO String
+    peekCStringLen,    -- :: TextEncoding -> CStringLen -> IO String
+    
+    -- conversion of Haskell strings into C strings
+    --
+    newCString,        -- :: TextEncoding -> String -> IO CString
+    newCStringLen,     -- :: TextEncoding -> String -> IO CStringLen
+    
+    -- conversion of Haskell strings into C strings using temporary storage
+    --
+    withCString,       -- :: TextEncoding -> String -> (CString    -> IO a) -> IO a
+    withCStringLen,    -- :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+    
+    charIsRepresentable, -- :: TextEncoding -> Char -> IO Bool
+  ) where
+
+import Foreign.Marshal.Array
+import Foreign.C.Types
+import Foreign.Ptr
+import Foreign.Storable
+
+import Data.Word
+
+-- Imports for the locale-encoding version of marshallers
+import Control.Monad
+
+import Data.Tuple (fst)
+import Data.Maybe
+
+import {-# SOURCE #-} System.Posix.Internals (puts)
+import GHC.Show ( show )
+
+import Foreign.Marshal.Alloc
+import Foreign.ForeignPtr
+
+import GHC.Err (undefined)
+import GHC.List
+import GHC.Num
+import GHC.Base
+
+import GHC.IO
+import GHC.IO.Exception
+import GHC.IO.Buffer
+import GHC.IO.Encoding.Types
+
+
+c_DEBUG_DUMP :: Bool
+c_DEBUG_DUMP = False
+
+putDebugMsg :: String -> IO ()
+putDebugMsg | c_DEBUG_DUMP = puts
+            | otherwise    = const (return ())
+
+
+-- These definitions are identical to those in Foreign.C.String, but copied in here to avoid a cycle:
+type CString    = Ptr CChar
+type CStringLen = (Ptr CChar, Int)
+
+-- exported functions
+-- ------------------
+
+-- | Marshal a NUL terminated C string into a Haskell string.
+--
+peekCString    :: TextEncoding -> CString -> IO String
+peekCString enc cp = do
+    sz <- lengthArray0 nUL cp
+    peekEncodedCString enc (cp, sz * cCharSize)
+
+-- | Marshal a C string with explicit length into a Haskell string.
+--
+peekCStringLen           :: TextEncoding -> CStringLen -> IO String
+peekCStringLen = peekEncodedCString
+
+-- | Marshal a Haskell string into a NUL terminated C string.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * new storage is allocated for the C string and must be
+--   explicitly freed using 'Foreign.Marshal.Alloc.free' or
+--   'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCString :: TextEncoding -> String -> IO CString
+newCString enc = liftM fst . newEncodedCString enc True
+
+-- | Marshal a Haskell string into a C string (ie, character array) with
+-- explicit length information.
+--
+-- * new storage is allocated for the C string and must be
+--   explicitly freed using 'Foreign.Marshal.Alloc.free' or
+--   'Foreign.Marshal.Alloc.finalizerFree'.
+--
+newCStringLen     :: TextEncoding -> String -> IO CStringLen
+newCStringLen enc = newEncodedCString enc False
+
+-- | Marshal a Haskell string into a NUL terminated C string using temporary
+-- storage.
+--
+-- * the Haskell string may /not/ contain any NUL characters
+--
+-- * the memory is freed when the subcomputation terminates (either
+--   normally or via an exception), so the pointer to the temporary
+--   storage must /not/ be used after this.
+--
+withCString :: TextEncoding -> String -> (CString -> IO a) -> IO a
+withCString enc s act = withEncodedCString enc True s $ \(cp, _sz) -> act cp
+
+-- | Marshal a Haskell string into a C string (ie, character array)
+-- in temporary storage, with explicit length information.
+--
+-- * the memory is freed when the subcomputation terminates (either
+--   normally or via an exception), so the pointer to the temporary
+--   storage must /not/ be used after this.
+--
+withCStringLen         :: TextEncoding -> String -> (CStringLen -> IO a) -> IO a
+withCStringLen enc = withEncodedCString enc False
+
+
+-- | Determines whether a character can be accurately encoded in a 'CString'.
+--
+-- Pretty much anyone who uses this function is in a state of sin because
+-- whether or not a character is encodable will, in general, depend on the
+-- context in which it occurs.
+charIsRepresentable :: TextEncoding -> Char -> IO Bool
+charIsRepresentable enc c = withCString enc [c] (fmap (== [c]) . peekCString enc) `catchException` (\e -> let _ = e :: IOException in return False)
+
+-- auxiliary definitions
+-- ----------------------
+
+-- C's end of string character
+nUL :: CChar
+nUL  = 0
+
+-- Size of a CChar in bytes
+cCharSize :: Int
+cCharSize = sizeOf (undefined :: CChar)
+
+
+{-# INLINE peekEncodedCString #-}
+peekEncodedCString :: TextEncoding -- ^ Encoding of CString
+                   -> CStringLen
+                   -> IO String    -- ^ String in Haskell terms
+peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
+  = bracket mk_decoder close $ \decoder -> do
+      let chunk_size = sz_bytes `max` 1 -- Decode buffer chunk size in characters: one iteration only for ASCII
+      from0 <- fmap (\fp -> bufferAdd sz_bytes (emptyBuffer fp sz_bytes ReadBuffer)) $ newForeignPtr_ (castPtr p)
+      to <- newCharBuffer chunk_size WriteBuffer
+
+      let go iteration from = do
+            (why, from', to') <- encode decoder from to
+            if isEmptyBuffer from'
+             then
+              -- No input remaining: @why@ will be InputUnderflow, but we don't care
+              withBuffer to' $ peekArray (bufferElems to')
+             else do
+              -- Input remaining: what went wrong?
+              putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
+              (from'', to'') <- case why of InvalidSequence -> recover decoder from' to' -- These conditions are equally bad because
+                                            InputUnderflow  -> recover decoder from' to' -- they indicate malformed/truncated input
+                                            OutputUnderflow -> return (from', to')       -- We will have more space next time round
+              putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
+              putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
+              to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
+              fmap (to_chars++) $ go (iteration + 1) from''
+
+      go (0 :: Int) from0
+
+{-# INLINE withEncodedCString #-}
+withEncodedCString :: TextEncoding         -- ^ Encoding of CString to create
+                   -> Bool                 -- ^ Null-terminate?
+                   -> String               -- ^ String to encode
+                   -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
+                   -> IO a
+withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
+  = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+      from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+      let go iteration to_sz_bytes = do
+           putDebugMsg ("withEncodedCString: " ++ show iteration)
+           allocaBytes to_sz_bytes $ \to_p -> do
+            mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes act
+            case mb_res of
+              Nothing  -> go (iteration + 1) (to_sz_bytes * 2)
+              Just res -> return res
+
+      -- If the input string is ASCII, this value will ensure we only allocate once
+      go (0 :: Int) (cCharSize * (sz + 1))
+
+{-# INLINE newEncodedCString #-}
+newEncodedCString :: TextEncoding  -- ^ Encoding of CString to create
+                  -> Bool          -- ^ Null-terminate?
+                  -> String        -- ^ String to encode
+                  -> IO CStringLen
+newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
+  = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
+      from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
+
+      let go iteration to_p to_sz_bytes = do
+           putDebugMsg ("newEncodedCString: " ++ show iteration)
+           mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes return
+           case mb_res of
+             Nothing  -> do
+                 let to_sz_bytes' = to_sz_bytes * 2
+                 to_p' <- reallocBytes to_p to_sz_bytes'
+                 go (iteration + 1) to_p' to_sz_bytes'
+             Just res -> return res
+
+      -- If the input string is ASCII, this value will ensure we only allocate once
+      let to_sz_bytes = cCharSize * (sz + 1)
+      to_p <- mallocBytes to_sz_bytes
+      go (0 :: Int) to_p to_sz_bytes
+
+
+tryFillBufferAndCall :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int
+                     -> (CStringLen -> IO a) -> IO (Maybe a)
+tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do
+    to_fp <- newForeignPtr_ to_p
+    go (0 :: Int) (from0, emptyBuffer to_fp to_sz_bytes WriteBuffer)
+  where
+    go iteration (from, to) = do
+      (why, from', to') <- encode encoder from to
+      putDebugMsg ("tryFillBufferAndCall: " ++ show iteration ++ " " ++ show why ++ " " ++ summaryBuffer from ++ " " ++ summaryBuffer from')
+      if isEmptyBuffer from'
+       then if null_terminate && bufferAvailable to' == 0
+             then return Nothing -- We had enough for the string but not the terminator: ask the caller for more buffer
+             else do
+               -- Awesome, we had enough buffer
+               let bytes = bufferElems to'
+               withBuffer to' $ \to_ptr -> do
+                   when null_terminate $ pokeElemOff to_ptr (bufR to') 0
+                   fmap Just $ act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes*
+       else case why of -- We didn't consume all of the input
+              InputUnderflow  -> recover encoder from' to' >>= go (iteration + 1) -- These conditions are equally bad
+              InvalidSequence -> recover encoder from' to' >>= go (iteration + 1) -- since the input was truncated/invalid
+              OutputUnderflow -> return Nothing -- Oops, out of buffer during decoding: ask the caller for more