Buffers for scanning string input stored in external arrays.
\begin{code}
+{-# LANGUAGE BangPatterns #-}
+{-# OPTIONS_GHC -O -funbox-strict-fields #-}
+-- We always optimise this, otherwise performance of a non-optimised
+-- compiler is severely affected
+
module StringBuffer
(
StringBuffer(..),
- -- non-abstract for vs\/HaskellService
+ -- non-abstract for vs\/HaskellService
- -- * Creation\/destruction
+ -- * Creation\/destruction
hGetStringBuffer,
hGetStringBufferBlock,
appendStringBuffers,
- stringToStringBuffer,
+ stringToStringBuffer,
- -- * Inspection
- nextChar,
- currentChar,
- prevChar,
- atEnd,
+ -- * Inspection
+ nextChar,
+ currentChar,
+ prevChar,
+ atEnd,
- -- * Moving and comparison
- stepOn,
- offsetBytes,
- byteDiff,
+ -- * Moving and comparison
+ stepOn,
+ offsetBytes,
+ byteDiff,
-- * Conversion
lexemeToString,
lexemeToFastString,
- -- * Parsing integers
- parseUnsignedInteger,
+ -- * Parsing integers
+ parseUnsignedInteger,
) where
#include "HsVersions.h"
import Encoding
-import FastString ( FastString,mkFastString,mkFastStringBytes )
+import FastString hiding ( buf )
+import FastTypes
+import FastFunctions
import Foreign
-import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
+import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
, Handle, hTell )
import GHC.Exts
-import GHC.IOBase ( IO(..) )
-import GHC.Base ( unsafeChr )
-
-#if __GLASGOW_HASKELL__ >= 601
-import System.IO ( openBinaryFile )
-#else
-import IOExts ( openFileEx, IOModeEx(..) )
-#endif
-#if __GLASGOW_HASKELL__ < 601
-openBinaryFile fp mode = openFileEx fp (BinaryMode mode)
-#endif
+import System.IO ( openBinaryFile )
-- -----------------------------------------------------------------------------
-- The StringBuffer type
data StringBuffer
= StringBuffer {
buf :: {-# UNPACK #-} !(ForeignPtr Word8),
- len :: {-# UNPACK #-} !Int, -- length
- cur :: {-# UNPACK #-} !Int -- current pos
+ len :: {-# UNPACK #-} !Int, -- length
+ cur :: {-# UNPACK #-} !Int -- current pos
}
- -- The buffer is assumed to be UTF-8 encoded, and furthermore
- -- we add three '\0' bytes to the end as sentinels so that the
- -- decoder doesn't have to check for overflow at every single byte
- -- of a multibyte sequence.
+ -- The buffer is assumed to be UTF-8 encoded, and furthermore
+ -- we add three '\0' bytes to the end as sentinels so that the
+ -- decoder doesn't have to check for overflow at every single byte
+ -- of a multibyte sequence.
instance Show StringBuffer where
- showsPrec _ s = showString "<stringbuffer("
- . shows (len s) . showString "," . shows (cur s)
- . showString ">"
+ showsPrec _ s = showString "<stringbuffer("
+ . shows (len s) . showString "," . shows (cur s)
+ . showString ")>"
-- -----------------------------------------------------------------------------
-- Creation / Destruction
r <- if size == 0 then return 0 else hGetBuf h ptr size
hClose h
if (r /= size)
- then ioError (userError "short read of file")
- else do
- pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
- -- sentinels for UTF-8 decoding
- return (StringBuffer buf size 0)
+ then ioError (userError "short read of file")
+ else newUTF8StringBuffer buf ptr size
hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
hGetStringBufferBlock handle wanted
withForeignPtr buf $ \ptr ->
do r <- if size == 0 then return 0 else hGetBuf handle ptr size
if r /= size
- then ioError (userError $ "short read of file: "++show(r,size,fromIntegral size_i,handle))
- else do pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
- return (StringBuffer buf size 0)
+ then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
+ else newUTF8StringBuffer buf ptr size
+
+newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
+newUTF8StringBuffer buf ptr size = do
+ pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
+ -- sentinels for UTF-8 decoding
+ let
+ sb0 = StringBuffer buf size 0
+ (first_char, sb1) = nextChar sb0
+ -- skip the byte-order mark if there is one (see #1744)
+ -- This is better than treating #FEFF as whitespace,
+ -- because that would mess up layout. We don't have a concept
+ -- of zero-width whitespace in Haskell: all whitespace codepoints
+ -- have a width of one column.
+ return (if first_char == '\xfeff' then sb1 else sb0)
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2
withForeignPtr newBuf $ \ptr ->
withForeignPtr (buf sb1) $ \sb1Ptr ->
withForeignPtr (buf sb2) $ \sb2Ptr ->
- do copyArray (sb1Ptr `advancePtr` cur sb1) ptr (calcLen sb1)
- copyArray (sb2Ptr `advancePtr` cur sb2) (ptr `advancePtr` cur sb1) (calcLen sb2)
+ do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
+ copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
pokeArray (ptr `advancePtr` size) [0,0,0]
return (StringBuffer newBuf size 0)
- where calcLen sb = len sb - cur sb
- size = calcLen sb1 + calcLen sb2
-
-stringToStringBuffer :: String -> IO StringBuffer
-stringToStringBuffer str = do
+ where sb1_len = calcLen sb1
+ sb2_len = calcLen sb2
+ calcLen sb = len sb - cur sb
+ size = sb1_len + sb2_len
+
+stringToStringBuffer :: String -> StringBuffer
+stringToStringBuffer str =
+ unsafePerformIO $ do
let size = utf8EncodedLength str
buf <- mallocForeignPtrArray (size+3)
withForeignPtr buf $ \ptr -> do
utf8EncodeString ptr str
pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
- -- sentinels for UTF-8 decoding
+ -- sentinels for UTF-8 decoding
return (StringBuffer buf size 0)
-- -----------------------------------------------------------------------------
nextChar (StringBuffer buf len (I# cur#)) =
inlinePerformIO $ do
withForeignPtr buf $ \(Ptr a#) -> do
- case utf8DecodeChar# (a# `plusAddr#` cur#) of
- (# c#, b# #) ->
- let cur' = I# (b# `minusAddr#` a#) in
- return (C# c#, StringBuffer buf len cur')
+ case utf8DecodeChar# (a# `plusAddr#` cur#) of
+ (# c#, b# #) ->
+ let cur' = I# (b# `minusAddr#` a#) in
+ return (C# c#, StringBuffer buf len cur')
currentChar :: StringBuffer -> Char
currentChar = fst . nextChar
prevChar :: StringBuffer -> Char -> Char
-prevChar (StringBuffer buf len 0) deflt = deflt
-prevChar (StringBuffer buf len cur) deflt =
+prevChar (StringBuffer _ _ 0) deflt = deflt
+prevChar (StringBuffer buf _ cur) _ =
inlinePerformIO $ do
withForeignPtr buf $ \p -> do
p' <- utf8PrevChar (p `plusPtr` cur)
lexemeToString :: StringBuffer -> Int {-bytes-} -> String
lexemeToString _ 0 = ""
lexemeToString (StringBuffer buf _ cur) bytes =
- inlinePerformIO $
- withForeignPtr buf $ \ptr ->
+ inlinePerformIO $
+ withForeignPtr buf $ \ptr ->
utf8DecodeString (ptr `plusPtr` cur) bytes
lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
-lexemeToFastString _ 0 = mkFastString ""
+lexemeToFastString _ 0 = nilFS
lexemeToFastString (StringBuffer buf _ cur) len =
inlinePerformIO $
withForeignPtr buf $ \ptr ->
-- -----------------------------------------------------------------------------
-- Parsing integer strings in various bases
-
+{-
byteOff :: StringBuffer -> Int -> Char
-byteOff (StringBuffer buf _ cur) i =
+byteOff (StringBuffer buf _ cur) i =
inlinePerformIO $ withForeignPtr buf $ \ptr -> do
- w <- peek (ptr `plusPtr` (cur+i))
- return (unsafeChr (fromIntegral (w::Word8)))
-
+-- return $! cBox (indexWord8OffFastPtrAsFastChar
+-- (pUnbox ptr) (iUnbox (cur+i)))
+--or
+-- w <- peek (ptr `plusPtr` (cur+i))
+-- return (unsafeChr (fromIntegral (w::Word8)))
+-}
-- | XXX assumes ASCII digits only (by using byteOff)
parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
-parseUnsignedInteger buf len radix char_to_int
- = go 0 0
- where
+parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
+ = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
+ --LOL, in implementations where the indexing needs slow unsafePerformIO,
+ --this is less (not more) efficient than using the IO monad explicitly
+ --here.
+ !ptr' = pUnbox ptr
+ byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
go i x | i == len = x
- | otherwise = go (i+1)
- (x * radix + toInteger (char_to_int (byteOff buf i)))
-
--- -----------------------------------------------------------------------------
--- under the carpet
-
--- Just like unsafePerformIO, but we inline it.
-{-# INLINE inlinePerformIO #-}
-inlinePerformIO :: IO a -> a
-inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
+ | otherwise = case byteOff i of
+ char -> go (i + 1) (x * radix + toInteger (char_to_int char))
+ in go 0 0
\end{code}