2 % (c) The University of Glasgow 2006
3 % (c) The University of Glasgow, 1997-2006
6 Buffers for scanning string input stored in external arrays.
10 -- The above warning suppression flag is a temporary kludge.
11 -- While working on this module you are encouraged to remove it and fix
12 -- any warnings in the module. See
13 -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
19 -- non-abstract for vs\/HaskellService
21 -- * Creation\/destruction
23 hGetStringBufferBlock,
33 -- * Moving and comparison
46 #include "HsVersions.h"
49 import FastString ( FastString,mkFastString,mkFastStringBytes )
52 import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
56 import GHC.IOBase ( IO(..) )
57 import GHC.Base ( unsafeChr )
59 #if __GLASGOW_HASKELL__ >= 601
60 import System.IO ( openBinaryFile )
62 import IOExts ( openFileEx, IOModeEx(..) )
65 #if __GLASGOW_HASKELL__ < 601
66 openBinaryFile fp mode = openFileEx fp (BinaryMode mode)
69 -- -----------------------------------------------------------------------------
70 -- The StringBuffer type
72 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
73 -- The bytes are intended to be *immutable*. There are pure
74 -- operations to read the contents of a StringBuffer.
76 -- A StringBuffer may have a finalizer, depending on how it was
81 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
82 len :: {-# UNPACK #-} !Int, -- length
83 cur :: {-# UNPACK #-} !Int -- current pos
85 -- The buffer is assumed to be UTF-8 encoded, and furthermore
86 -- we add three '\0' bytes to the end as sentinels so that the
87 -- decoder doesn't have to check for overflow at every single byte
88 -- of a multibyte sequence.
90 instance Show StringBuffer where
91 showsPrec _ s = showString "<stringbuffer("
92 . shows (len s) . showString "," . shows (cur s)
95 -- -----------------------------------------------------------------------------
96 -- Creation / Destruction
98 hGetStringBuffer :: FilePath -> IO StringBuffer
99 hGetStringBuffer fname = do
100 h <- openBinaryFile fname ReadMode
101 size_i <- hFileSize h
102 let size = fromIntegral size_i
103 buf <- mallocForeignPtrArray (size+3)
104 withForeignPtr buf $ \ptr -> do
105 r <- if size == 0 then return 0 else hGetBuf h ptr size
108 then ioError (userError "short read of file")
109 else newUTF8StringBuffer buf ptr size
111 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
112 hGetStringBufferBlock handle wanted
113 = do size_i <- hFileSize handle
114 offset_i <- hTell handle
115 let size = min wanted (fromIntegral $ size_i-offset_i)
116 buf <- mallocForeignPtrArray (size+3)
117 withForeignPtr buf $ \ptr ->
118 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
120 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
121 else newUTF8StringBuffer buf ptr size
123 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
124 newUTF8StringBuffer buf ptr size = do
125 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
126 -- sentinels for UTF-8 decoding
128 sb0 = StringBuffer buf size 0
129 (first_char, sb1) = nextChar sb0
130 -- skip the byte-order mark if there is one (see #1744)
131 -- This is better than treating #FEFF as whitespace,
132 -- because that would mess up layout. We don't have a concept
133 -- of zero-width whitespace in Haskell: all whitespace codepoints
134 -- have a width of one column.
135 return (if first_char == '\xfeff' then sb1 else sb0)
-- | Concatenate the unread portions of two buffers into a freshly
-- allocated buffer.
--
-- Only the bytes between each buffer's current position ('cur') and its
-- length ('len') are copied.  The result has its current position at 0,
-- a length equal to the total number of bytes copied, and is followed by
-- three zero bytes (the sentinels relied upon by the UTF-8 decoder).
-- Neither input buffer is modified.
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2
    = do newBuf <- mallocForeignPtrArray (size+3)
         withForeignPtr newBuf $ \ptr ->
          withForeignPtr (buf sb1) $ \sb1Ptr ->
           withForeignPtr (buf sb2) $ \sb2Ptr ->
             do -- NB: copyArray takes the *destination* first, then the source.
                copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
                -- The second chunk goes directly after the bytes copied
                -- from sb1, i.e. at offset sb1_len (not at 'cur sb1').
                copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
                pokeArray (ptr `advancePtr` size) [0,0,0]
                return (StringBuffer newBuf size 0)
    where calcLen sb = len sb - cur sb   -- bytes remaining after the cursor
          sb1_len    = calcLen sb1
          sb2_len    = calcLen sb2
          size       = sb1_len + sb2_len
-- | Build a 'StringBuffer' holding the UTF-8 encoding of a 'String'.
--
-- The buffer is allocated three bytes larger than the encoded text so
-- that the zero sentinels needed for UTF-8 decoding can be written after
-- it.  The resulting buffer starts at position 0.
stringToStringBuffer :: String -> IO StringBuffer
stringToStringBuffer str = do
    fp <- mallocForeignPtrArray (nbytes + 3)
    withForeignPtr fp $ \p -> do
        utf8EncodeString p str
        -- trailing sentinels for UTF-8 decoding
        pokeArray (p `plusPtr` nbytes :: Ptr Word8) [0,0,0]
    return (StringBuffer fp nbytes 0)
  where
    -- size in bytes of the encoded string, excluding the sentinels
    nbytes = utf8EncodedLength str
160 -- -----------------------------------------------------------------------------
163 -- Getting our fingers dirty a little here, but this is performance-critical
164 {-# INLINE nextChar #-}
165 nextChar :: StringBuffer -> (Char,StringBuffer)
166 nextChar (StringBuffer buf len (I# cur#)) =
168 withForeignPtr buf $ \(Ptr a#) -> do
169 case utf8DecodeChar# (a# `plusAddr#` cur#) of
171 let cur' = I# (b# `minusAddr#` a#) in
172 return (C# c#, StringBuffer buf len cur')
-- | The character at the buffer's current position, as produced by
-- 'nextChar'; the advanced buffer that 'nextChar' also returns is
-- discarded.
currentChar :: StringBuffer -> Char
currentChar sb =
  case nextChar sb of
    (c, _) -> c
177 prevChar :: StringBuffer -> Char -> Char
178 prevChar (StringBuffer buf len 0) deflt = deflt
179 prevChar (StringBuffer buf len cur) deflt =
181 withForeignPtr buf $ \p -> do
182 p' <- utf8PrevChar (p `plusPtr` cur)
183 return (fst (utf8DecodeChar p'))
185 -- -----------------------------------------------------------------------------
-- | Advance the buffer past one character: the buffer component of
-- 'nextChar', with the decoded character dropped.
stepOn :: StringBuffer -> StringBuffer
stepOn = snd . nextChar
-- | Move the current position by the given number of /bytes/ (not
-- characters).  The underlying storage and recorded length are left
-- untouched, and no bounds check is performed.
offsetBytes :: Int -> StringBuffer -> StringBuffer
offsetBytes i (StringBuffer b l c) = StringBuffer b l (c + i)
-- | Distance in bytes from the current position of the first buffer to
-- the current position of the second.  Only the positions are compared;
-- the underlying storage is not inspected.
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff s1 s2 = end - start
  where
    start = cur s1
    end   = cur s2
-- | 'True' exactly when the current position equals the buffer length.
atEnd :: StringBuffer -> Bool
atEnd sb = len sb == cur sb
200 -- -----------------------------------------------------------------------------
203 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
204 lexemeToString _ 0 = ""
205 lexemeToString (StringBuffer buf _ cur) bytes =
207 withForeignPtr buf $ \ptr ->
208 utf8DecodeString (ptr `plusPtr` cur) bytes
210 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
211 lexemeToFastString _ 0 = mkFastString ""
212 lexemeToFastString (StringBuffer buf _ cur) len =
214 withForeignPtr buf $ \ptr ->
215 return $! mkFastStringBytes (ptr `plusPtr` cur) len
217 -- -----------------------------------------------------------------------------
218 -- Parsing integer strings in various bases
-- | Read the single byte located @i@ bytes after the current position
-- and return it as a 'Char' with that code point.
--
-- No UTF-8 decoding and no bounds checking is done here.
byteOff :: StringBuffer -> Int -> Char
byteOff sb i = inlinePerformIO (withForeignPtr (buf sb) readByte)
  where
    -- Peek the raw byte relative to the start of the storage.
    readByte base = do
      w <- peek (base `plusPtr` (cur sb + i))
      return (unsafeChr (fromIntegral (w :: Word8)))
226 -- | XXX assumes ASCII digits only (by using byteOff)
227 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
228 parseUnsignedInteger buf len radix char_to_int
231 go i x | i == len = x
232 | otherwise = go (i+1)
233 (x * radix + toInteger (char_to_int (byteOff buf i)))
235 -- -----------------------------------------------------------------------------
-- Behaves like unsafePerformIO, except that it is marked INLINE: the IO
-- action is unwrapped and applied directly to realWorld#, and only its
-- result is kept (the returned state token is thrown away).
{-# INLINE inlinePerformIO #-}
inlinePerformIO :: IO a -> a
inlinePerformIO action =
  case action of
    IO run ->
      case run realWorld# of
        (# _, result #) -> result