2 % (c) The University of Glasgow 2006
3 % (c) The University of Glasgow, 1997-2006
6 Buffers for scanning string input stored in external arrays.
12 -- non-abstract for vs\/HaskellService
14 -- * Creation\/destruction
16 hGetStringBufferBlock,
26 -- * Moving and comparison
39 #include "HsVersions.h"
42 import FastString hiding ( buf )
47 import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
52 import System.IO ( openBinaryFile )
54 -- -----------------------------------------------------------------------------
55 -- The StringBuffer type
57 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
58 -- The bytes are intended to be *immutable*. There are pure
59 -- operations to read the contents of a StringBuffer.
61 -- A StringBuffer may have a finalizer, depending on how it was
-- NOTE(review): the rest of the sentence above and the declaration header
-- are not visible in this excerpt.
66 buf :: {-# UNPACK #-} !(ForeignPtr Word8), -- the underlying bytes
67 len :: {-# UNPACK #-} !Int, -- length in bytes, not counting the three trailing sentinel bytes
68 cur :: {-# UNPACK #-} !Int -- current pos, as a byte offset from the start of buf
70 -- The buffer is assumed to be UTF-8 encoded, and furthermore
71 -- we add three '\0' bytes to the end as sentinels so that the
72 -- decoder doesn't have to check for overflow at every single byte
73 -- of a multibyte sequence.
-- Textual rendering of a buffer: only the 'len' and 'cur' fields are shown,
-- never the buffer contents.
-- NOTE(review): the end of this composition chain is not visible in this
-- excerpt; confirm how the rendered text is closed.
75 instance Show StringBuffer where
76 showsPrec _ s = showString "<stringbuffer("
77 . shows (len s) . showString "," . shows (cur s)
80 -- -----------------------------------------------------------------------------
81 -- Creation / Destruction
-- | Read the contents of the named file into a freshly allocated
-- 'StringBuffer'. The file is opened in binary mode and the allocation is
-- three bytes larger than the data, leaving room for the sentinel bytes
-- that 'newUTF8StringBuffer' writes after it.
--
-- The failure branch raises a user 'IOError' ("short read of file").
--
-- NOTE(review): the binding of @size_i@ and the condition guarding the
-- then/else below are not visible in this excerpt; presumably the size comes
-- from 'hFileSize' and the check compares @r@ with @size@ -- confirm.
-- NOTE(review): no 'hClose' is visible here either; confirm the handle is
-- released on the error path as well as on success.
83 hGetStringBuffer :: FilePath -> IO StringBuffer
84 hGetStringBuffer fname = do
85 h <- openBinaryFile fname ReadMode
87 let size = fromIntegral size_i
88 buf <- mallocForeignPtrArray (size+3)
89 withForeignPtr buf $ \ptr -> do
-- An empty file skips the read entirely; otherwise r is hGetBuf's result.
90 r <- if size == 0 then return 0 else hGetBuf h ptr size
93 then ioError (userError "short read of file")
94 else newUTF8StringBuffer buf ptr size
-- | Read a block of at most @wanted@ bytes from the handle, starting at its
-- current position, into a freshly allocated 'StringBuffer'. The request is
-- capped at the distance between the current position ('hTell') and the
-- file size ('hFileSize').
--
-- The failure branch raises a user 'IOError' whose message includes the
-- read count, the requested size, the file size and the handle.
--
-- NOTE(review): the condition guarding the then/else below is not visible in
-- this excerpt; presumably it compares @r@ with @size@ -- confirm.
96 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
97 hGetStringBufferBlock handle wanted
98 = do size_i <- hFileSize handle
99 offset_i <- hTell handle
100 let size = min wanted (fromIntegral $ size_i-offset_i)
-- Three extra bytes leave room for the sentinels.
101 buf <- mallocForeignPtrArray (size+3)
102 withForeignPtr buf $ \ptr ->
-- A zero-length request skips the read entirely.
103 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
105 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
106 else newUTF8StringBuffer buf ptr size
-- | Finish building a 'StringBuffer' over @size@ bytes of data located at
-- @ptr@: write three zero bytes immediately after the data, then return a
-- buffer of length @size@ positioned at offset 0, or just past the first
-- character when that character is a byte-order mark (U+FEFF).
--
-- The memory at @ptr@ must therefore have room for @size + 3@ bytes. The
-- callers in this file pass the pointer obtained from 'withForeignPtr' on
-- @buf@.
--
-- NOTE(review): the line introducing the @sb0@ / @sb1@ bindings is not
-- visible in this excerpt.
108 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
109 newUTF8StringBuffer buf ptr size = do
110 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
111 -- sentinels for UTF-8 decoding
113 sb0 = StringBuffer buf size 0
114 (first_char, sb1) = nextChar sb0
115 -- skip the byte-order mark if there is one (see #1744)
116 -- This is better than treating #FEFF as whitespace,
117 -- because that would mess up layout. We don't have a concept
118 -- of zero-width whitespace in Haskell: all whitespace codepoints
119 -- have a width of one column.
120 return (if first_char == '\xfeff' then sb1 else sb0)
-- | Concatenate the unconsumed parts of two buffers (the bytes from 'cur'
-- up to 'len' of each) into a freshly allocated buffer. The result starts
-- at offset 0 and is followed by the three zero sentinel bytes used for
-- UTF-8 decoding. Neither input buffer is modified.
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2
    = do newBuf <- mallocForeignPtrArray (size+3)
         withForeignPtr newBuf $ \ptr ->
          withForeignPtr (buf sb1) $ \sb1Ptr ->
           withForeignPtr (buf sb2) $ \sb2Ptr ->
             do -- copyArray takes the destination first and the source
                -- second, so the new buffer must be the first argument;
                -- the other order would overwrite the input buffers with
                -- uninitialised memory.
                copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
                -- The second chunk lands right after the bytes copied from
                -- sb1, i.e. at offset sb1_len (not at cur sb1).
                copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
                pokeArray (ptr `advancePtr` size) [0,0,0]
                return (StringBuffer newBuf size 0)
    where sb1_len = calcLen sb1
          sb2_len = calcLen sb2
          -- number of bytes remaining after the current position
          calcLen sb = len sb - cur sb
          size = sb1_len + sb2_len
-- | Encode a 'String' as UTF-8 into a newly allocated buffer. The
-- allocation is three bytes larger than the encoded text so that the zero
-- sentinel bytes used for UTF-8 decoding can follow it. The resulting
-- buffer is positioned at offset 0.
stringToStringBuffer :: String -> IO StringBuffer
stringToStringBuffer str =
  mallocForeignPtrArray (nbytes + 3) >>= \fp ->
    withForeignPtr fp $ \p -> do
      utf8EncodeString p str
      -- sentinels for UTF-8 decoding
      pokeArray (p `plusPtr` nbytes :: Ptr Word8) [0,0,0]
      return (StringBuffer fp nbytes 0)
  where
    -- size of the UTF-8 encoding of str, in bytes
    nbytes = utf8EncodedLength str
145 -- -----------------------------------------------------------------------------
-- | Decode the character at the current position and return it together
-- with a buffer whose position has moved past it. The underlying bytes and
-- 'len' are shared with the input; only the position changes.
--
-- No comparison against 'len' is visible here.
-- NOTE(review): presumably this relies on the sentinel bytes after the data
-- -- confirm.
-- NOTE(review): the wrapper that makes this IO action pure and the case
-- alternative binding @c#@ and @b#@ are not visible in this excerpt.
148 -- Getting our fingers dirty a little here, but this is performance-critical
149 {-# INLINE nextChar #-}
150 nextChar :: StringBuffer -> (Char,StringBuffer)
151 nextChar (StringBuffer buf len (I# cur#)) =
153 withForeignPtr buf $ \(Ptr a#) -> do
154 case utf8DecodeChar# (a# `plusAddr#` cur#) of
-- The new position is the distance of b# from the buffer base a#.
156 let cur' = I# (b# `minusAddr#` a#) in
157 return (C# c#, StringBuffer buf len cur')
-- | The character at the current position, i.e. the first component of
-- 'nextChar'; the advanced buffer is discarded.
currentChar :: StringBuffer -> Char
currentChar sb = case nextChar sb of
  (c, _) -> c
-- | The character before the current position. When the buffer is at
-- offset 0 there is no such character and the supplied default is returned
-- instead.
--
-- NOTE(review): 'utf8PrevChar' presumably steps back to the start of the
-- preceding UTF-8 sequence -- confirm against its definition.
-- NOTE(review): the wrapper that makes this IO action pure is not visible in
-- this excerpt.
162 prevChar :: StringBuffer -> Char -> Char
163 prevChar (StringBuffer _ _ 0) deflt = deflt
164 prevChar (StringBuffer buf _ cur) _ =
166 withForeignPtr buf $ \p -> do
167 p' <- utf8PrevChar (p `plusPtr` cur)
168 return (fst (utf8DecodeChar p'))
170 -- -----------------------------------------------------------------------------
-- | Advance the buffer past the character at the current position, i.e.
-- the second component of 'nextChar'; the decoded character is discarded.
stepOn :: StringBuffer -> StringBuffer
stepOn = snd . nextChar
-- | Move the current position by the given number of bytes. The underlying
-- bytes and the length are left untouched; no bounds check is performed.
offsetBytes :: Int -> StringBuffer -> StringBuffer
offsetBytes delta (StringBuffer bytes size pos) =
  StringBuffer bytes size (pos + delta)
-- | Number of bytes from the position of the first buffer to the position
-- of the second (second position minus first position).
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff (StringBuffer _ _ start) (StringBuffer _ _ end) = end - start
-- | True exactly when the current position equals the buffer length.
atEnd :: StringBuffer -> Bool
atEnd sb = cur sb == len sb
185 -- -----------------------------------------------------------------------------
-- | Convert the given number of bytes, starting at the current position,
-- to a 'String' using 'utf8DecodeString'. A request for zero bytes yields
-- the empty string without touching the buffer.
--
-- NOTE(review): the wrapper that makes this IO action pure is not visible in
-- this excerpt.
188 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
189 lexemeToString _ 0 = ""
190 lexemeToString (StringBuffer buf _ cur) bytes =
192 withForeignPtr buf $ \ptr ->
193 utf8DecodeString (ptr `plusPtr` cur) bytes
-- | Build a 'FastString' from the given number of bytes starting at the
-- current position, using 'mkFastStringBytes'. A request for zero bytes
-- yields 'nilFS' without touching the buffer. The result is forced ($!)
-- before the action returns.
--
-- NOTE(review): the wrapper that makes this IO action pure is not visible in
-- this excerpt.
195 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
196 lexemeToFastString _ 0 = nilFS
197 lexemeToFastString (StringBuffer buf _ cur) len =
199 withForeignPtr buf $ \ptr ->
200 return $! mkFastStringBytes (ptr `plusPtr` cur) len
202 -- -----------------------------------------------------------------------------
203 -- Parsing integer strings in various bases
-- | Look up a 'Char' relative to the buffer's current position; the Int
-- argument is an offset. The commented-out alternatives below both read the
-- single byte at @cur + i@ and convert it to a 'Char'.
--
-- NOTE(review): the live statement of the do-block is not visible in this
-- excerpt; presumably it performs the same single-byte read -- confirm.
205 byteOff :: StringBuffer -> Int -> Char
206 byteOff (StringBuffer buf _ cur) i =
207 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
208 -- return $! cBox (indexWord8OffFastPtrAsFastChar
209 -- (pUnbox ptr) (iUnbox (cur+i)))
211 -- w <- peek (ptr `plusPtr` (cur+i))
212 -- return (unsafeChr (fromIntegral (w::Word8)))
214 -- | XXX assumes ASCII digits only (by using byteOff)
215 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
216 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
217 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
218 --LOL, in implementations where the indexing needs slow unsafePerformIO,
219 --this is less (not more) efficient than using the IO monad explicitly
222 byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
223 go i x | i == len = x
224 | otherwise = case byteOff i of
225 char -> go (i + 1) (x * radix + toInteger (char_to_int char))