2 % (c) The University of Glasgow 2006
3 % (c) The University of Glasgow, 1997-2006
6 Buffers for scanning string input stored in external arrays.
9 {-# OPTIONS_GHC -O -funbox-strict-fields #-}
10 -- We always optimise this, otherwise performance of a non-optimised
11 -- compiler is severely affected
16 -- non-abstract for vs\/HaskellService
18 -- * Creation\/destruction
20 hGetStringBufferBlock,
30 -- * Moving and comparison
43 #include "HsVersions.h"
46 import FastString hiding ( buf )
51 import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
56 import System.IO ( openBinaryFile )
58 -- -----------------------------------------------------------------------------
59 -- The StringBuffer type
61 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
62 -- The bytes are intended to be *immutable*. There are pure
63 -- operations to read the contents of a StringBuffer.
65 -- A StringBuffer may have a finalizer, depending on how it was obtained.
70 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
71 len :: {-# UNPACK #-} !Int, -- length
72 cur :: {-# UNPACK #-} !Int -- current pos
74 -- The buffer is assumed to be UTF-8 encoded, and furthermore
75 -- we add three '\0' bytes to the end as sentinels so that the
76 -- decoder doesn't have to check for overflow at every single byte
77 -- of a multibyte sequence.
-- Debugging rendering of a buffer: prints the literal "<stringbuffer("
-- followed by the buffer length, a comma, and the current position.
-- The buffer contents themselves are not shown.
-- NOTE(review): the tail of this expression is not visible in this
-- excerpt; confirm how the rendering is closed off.
79 instance Show StringBuffer where
80 showsPrec _ s = showString "<stringbuffer("
81 . shows (len s) . showString "," . shows (cur s)
84 -- -----------------------------------------------------------------------------
85 -- Creation / Destruction
-- | Read the file named @fname@ into a freshly allocated 'StringBuffer'.
--
-- The file is opened in binary mode and its contents are read into a
-- buffer of @size + 3@ bytes; the three extra bytes leave room for the
-- sentinels that 'newUTF8StringBuffer' writes after the data.
--
-- Fails with a 'userError' ("short read of file") on the error branch.
-- NOTE(review): the binding of @size_i@ and the condition guarding the
-- error branch are not visible in this excerpt; confirm them (and where
-- the handle is closed) against the full file.
87 hGetStringBuffer :: FilePath -> IO StringBuffer
88 hGetStringBuffer fname = do
89 h <- openBinaryFile fname ReadMode
91 let size = fromIntegral size_i
-- Three extra bytes are reserved for the trailing sentinels.
92 buf <- mallocForeignPtrArray (size+3)
93 withForeignPtr buf $ \ptr -> do
-- A zero-length file skips the read entirely.
94 r <- if size == 0 then return 0 else hGetBuf h ptr size
97 then ioError (userError "short read of file")
98 else newUTF8StringBuffer buf ptr size
-- | Read a block of at most @wanted@ bytes from @handle@, starting at the
-- handle's current position, into a freshly allocated 'StringBuffer'.
--
-- The number of bytes read is the smaller of @wanted@ and the distance
-- from the current position ('hTell') to the end of the file
-- ('hFileSize'). The buffer is allocated with three extra bytes for the
-- sentinels written by 'newUTF8StringBuffer'.
--
-- Fails with a 'userError' whose message includes the bytes read, the
-- requested size, the file size and the handle.
-- NOTE(review): the condition guarding the error branch is not visible
-- in this excerpt; confirm it against the full file.
100 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
101 hGetStringBufferBlock handle wanted
102 = do size_i <- hFileSize handle
103 offset_i <- hTell handle
-- Never ask for more than what remains between the current offset and EOF.
104 let size = min wanted (fromIntegral $ size_i-offset_i)
105 buf <- mallocForeignPtrArray (size+3)
106 withForeignPtr buf $ \ptr ->
-- Nothing left to read: skip the hGetBuf call.
107 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
109 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
110 else newUTF8StringBuffer buf ptr size
-- | Wrap an already-filled byte array as a 'StringBuffer' positioned at
-- its start.
--
-- @buf@ is the foreign pointer owning the memory, @ptr@ is the raw
-- pointer to the same memory, and @size@ is the number of data bytes.
-- Three zero bytes are written immediately after the data, so the
-- allocation must be at least @size + 3@ bytes long.
--
-- If the first decoded character is U+FEFF (a byte-order mark), the
-- returned buffer is positioned just past it; otherwise it is positioned
-- at offset 0.
112 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
113 newUTF8StringBuffer buf ptr size = do
114 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
115 -- sentinels for UTF-8 decoding
-- sb0 is the buffer at offset 0; sb1 is the same buffer after one character.
117 sb0 = StringBuffer buf size 0
118 (first_char, sb1) = nextChar sb0
119 -- skip the byte-order mark if there is one (see #1744)
120 -- This is better than treating #FEFF as whitespace,
121 -- because that would mess up layout. We don't have a concept
122 -- of zero-width whitespace in Haskell: all whitespace codepoints
123 -- have a width of one column.
124 return (if first_char == '\xfeff' then sb1 else sb0)
-- | Concatenate the unread parts of two buffers into a new buffer.
--
-- Only the bytes from each buffer's current position up to its length are
-- copied: first those of @sb1@, then those of @sb2@. The result is
-- positioned at offset 0 and is followed by three zero sentinel bytes.
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2 = do
  let remaining sb = len sb - cur sb
      firstLen     = remaining sb1
      secondLen    = remaining sb2
      total        = firstLen + secondLen
  -- Reserve room for the data plus the three trailing sentinels.
  combined <- mallocForeignPtrArray (total + 3)
  withForeignPtr combined $ \dst ->
    withForeignPtr (buf sb1) $ \src1 ->
      withForeignPtr (buf sb2) $ \src2 -> do
        copyArray dst (src1 `advancePtr` cur sb1) firstLen
        copyArray (dst `advancePtr` firstLen) (src2 `advancePtr` cur sb2) secondLen
        -- Terminate with the sentinel bytes.
        pokeArray (dst `advancePtr` total) [0,0,0]
        return (StringBuffer combined total 0)
-- | Encode a 'String' as UTF-8 into a freshly allocated 'StringBuffer'
-- positioned at offset 0.
--
-- The allocation is three bytes longer than the encoded text so that
-- three zero sentinel bytes can follow the data.
stringToStringBuffer :: String -> IO StringBuffer
stringToStringBuffer str = do
  fp <- mallocForeignPtrArray (nbytes + 3)
  withForeignPtr fp $ \dst -> do
    utf8EncodeString dst str
    -- sentinels for UTF-8 decoding
    pokeArray (dst `plusPtr` nbytes :: Ptr Word8) [0,0,0]
    return (StringBuffer fp nbytes 0)
  where
    -- Number of bytes the UTF-8 encoding of the input occupies.
    nbytes = utf8EncodedLength str
151 -- -----------------------------------------------------------------------------
154 -- Getting our fingers dirty a little here, but this is performance-critical
-- | Decode the character at the buffer's current position and return it
-- together with a buffer advanced past that character.
--
-- The decoder is handed the address @base + cur@; the new position is
-- computed as the distance between the address the decoder returns and
-- the buffer's base address. The buffer's length is not consulted here.
-- NOTE(review): the wrapper that runs this IO action in a pure context
-- and the pattern that binds @c#@ and @b#@ are not visible in this
-- excerpt; confirm them against the full file.
155 {-# INLINE nextChar #-}
156 nextChar :: StringBuffer -> (Char,StringBuffer)
157 nextChar (StringBuffer buf len (I# cur#)) =
159 withForeignPtr buf $ \(Ptr a#) -> do
160 case utf8DecodeChar# (a# `plusAddr#` cur#) of
-- New cursor = offset of the post-character address from the base address.
162 let cur' = I# (b# `minusAddr#` a#) in
163 return (C# c#, StringBuffer buf len cur')
-- | The character at the buffer's current position. The buffer itself is
-- not advanced; the decoded character from 'nextChar' is simply returned.
currentChar :: StringBuffer -> Char
currentChar sb = case nextChar sb of
  (c, _) -> c
-- | The character immediately before the buffer's current position.
--
-- When the current position is 0 there is no previous character and the
-- supplied default @deflt@ is returned instead. Otherwise 'utf8PrevChar'
-- is used to step back from the address @base + cur@ and the character
-- found there is decoded.
-- NOTE(review): the wrapper that runs this IO action in a pure context is
-- not visible in this excerpt; confirm it against the full file.
168 prevChar :: StringBuffer -> Char -> Char
169 prevChar (StringBuffer _ _ 0) deflt = deflt
170 prevChar (StringBuffer buf _ cur) _ =
172 withForeignPtr buf $ \p -> do
173 p' <- utf8PrevChar (p `plusPtr` cur)
174 return (fst (utf8DecodeChar p'))
176 -- -----------------------------------------------------------------------------
-- | Advance the buffer past the character at its current position,
-- discarding the character that 'nextChar' decodes.
stepOn :: StringBuffer -> StringBuffer
stepOn = snd . nextChar
-- | Move the buffer's current position by @i@ bytes. The offset is added
-- as-is: no bounds check and no character-boundary check is performed.
offsetBytes :: Int -> StringBuffer -> StringBuffer
offsetBytes i (StringBuffer fp n pos) = StringBuffer fp n (pos + i)
-- | Number of bytes from the first buffer's current position to the
-- second buffer's current position (second minus first). Only the
-- positions are compared; the underlying storage is not examined.
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff (StringBuffer _ _ from) (StringBuffer _ _ to) = to - from
-- | 'True' exactly when the current position equals the buffer's length.
atEnd :: StringBuffer -> Bool
atEnd sb = len sb == cur sb
191 -- -----------------------------------------------------------------------------
-- | Decode @bytes@ bytes, starting at the buffer's current position, into
-- a 'String' using 'utf8DecodeString'.
--
-- A request for 0 bytes returns the empty string without touching the
-- buffer. The byte count is not checked against the buffer length here.
-- NOTE(review): the wrapper that runs this IO action in a pure context is
-- not visible in this excerpt; confirm it against the full file.
194 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
195 lexemeToString _ 0 = ""
196 lexemeToString (StringBuffer buf _ cur) bytes =
198 withForeignPtr buf $ \ptr ->
199 utf8DecodeString (ptr `plusPtr` cur) bytes
-- | Build a 'FastString' from @len@ bytes starting at the buffer's current
-- position, via 'mkFastStringBytes'.
--
-- A request for 0 bytes returns 'nilFS' without touching the buffer. The
-- result is forced (@$!@) before it is returned from the IO action.
-- Note that the @len@ parameter here is the byte count requested by the
-- caller, not the buffer's own length field (which is ignored).
-- NOTE(review): the wrapper that runs this IO action in a pure context is
-- not visible in this excerpt; confirm it against the full file.
201 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
202 lexemeToFastString _ 0 = nilFS
203 lexemeToFastString (StringBuffer buf _ cur) len =
205 withForeignPtr buf $ \ptr ->
206 return $! mkFastStringBytes (ptr `plusPtr` cur) len
208 -- -----------------------------------------------------------------------------
209 -- Parsing integer strings in various bases
-- | Look up a 'Char' at byte offset @i@ relative to the buffer's current
-- position.
-- NOTE(review): the live statements of this body are not visible in this
-- excerpt; only two commented-out alternatives remain. Both index the
-- byte at @cur + i@ and convert it to a 'Char', so presumably the live
-- code does the same — confirm against the full file.
211 byteOff :: StringBuffer -> Int -> Char
212 byteOff (StringBuffer buf _ cur) i =
213 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
214 -- return $! cBox (indexWord8OffFastPtrAsFastChar
215 -- (pUnbox ptr) (iUnbox (cur+i)))
217 -- w <- peek (ptr `plusPtr` (cur+i))
218 -- return (unsafeChr (fromIntegral (w::Word8)))
220 -- | XXX assumes ASCII digits only (by using byteOff)
221 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
222 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
223 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
224 --LOL, in implementations where the indexing needs slow unsafePerformIO,
225 --this is less (not more) efficient than using the IO monad explicitly
228 byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
229 go i x | i == len = x
230 | otherwise = case byteOff i of
231 char -> go (i + 1) (x * radix + toInteger (char_to_int char))