compiler/utils/StringBuffer.lhs

   1 %
   2 % (c) The University of Glasgow 2006
   3 % (c) The University of Glasgow, 1997-2006
   4 %
   5
   6 Buffers for scanning string input stored in external arrays.
   7
   8 \begin{code}
   9 module StringBuffer
  10        (
  11         StringBuffer(..),
  12         -- non-abstract for vs\/HaskellService
  13
  14          -- * Creation\/destruction
  15         hGetStringBuffer,
  16         hGetStringBufferBlock,
  17         appendStringBuffers,
  18         stringToStringBuffer,
  19
  20         -- * Inspection
  21         nextChar,
  22         currentChar,
  23         prevChar,
  24         atEnd,
  25
  26         -- * Moving and comparison
  27         stepOn,
  28         offsetBytes,
  29         byteDiff,
  30
  31         -- * Conversion
  32         lexemeToString,
  33         lexemeToFastString,
  34
  35          -- * Parsing integers
  36         parseUnsignedInteger,
  37        ) where
  38
  39 #include "HsVersions.h"
  40
import Encoding
import FastString hiding ( buf )
import FastTypes
import FastFunctions

import Control.Exception        ( bracket )
import Foreign
import System.IO                ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
                                , Handle, hTell )

import GHC.Exts
  51
  52 #if !defined(__GLASGOW_HASKELL__) || __GLASGOW_HASKELL__ >= 601
  53 import System.IO                ( openBinaryFile )
  54 #else
  55 import IOExts                   ( openFileEx, IOModeEx(..) )
  56 #endif
  57
  58 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ < 601
  59 openBinaryFile fp mode = openFileEx fp (BinaryMode mode)
  60 #endif
  61
  62 -- -----------------------------------------------------------------------------
  63 -- The StringBuffer type
  64
  65 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
  66 -- The bytes are intended to be *immutable*.  There are pure
  67 -- operations to read the contents of a StringBuffer.
  68 --
  69 -- A StringBuffer may have a finalizer, depending on how it was
  70 -- obtained.
  71 --
  72 data StringBuffer
  73  = StringBuffer {
  74      buf :: {-# UNPACK #-} !(ForeignPtr Word8),
  75      len :: {-# UNPACK #-} !Int,        -- length
  76      cur :: {-# UNPACK #-} !Int         -- current pos
  77   }
  78   -- The buffer is assumed to be UTF-8 encoded, and furthermore
  79   -- we add three '\0' bytes to the end as sentinels so that the
  80   -- decoder doesn't have to check for overflow at every single byte
  81   -- of a multibyte sequence.
  82
  83 instance Show StringBuffer where
  84         showsPrec _ s = showString "<stringbuffer("
  85                       . shows (len s) . showString "," . shows (cur s)
  86                       . showString ">"
  87
  88 -- -----------------------------------------------------------------------------
  89 -- Creation / Destruction
  90
  91 hGetStringBuffer :: FilePath -> IO StringBuffer
  92 hGetStringBuffer fname = do
  93    h <- openBinaryFile fname ReadMode
  94    size_i <- hFileSize h
  95    let size = fromIntegral size_i
  96    buf <- mallocForeignPtrArray (size+3)
  97    withForeignPtr buf $ \ptr -> do
  98      r <- if size == 0 then return 0 else hGetBuf h ptr size
  99      hClose h
 100      if (r /= size)
 101         then ioError (userError "short read of file")
 102         else newUTF8StringBuffer buf ptr size
 103
 104 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
 105 hGetStringBufferBlock handle wanted
 106     = do size_i <- hFileSize handle
 107          offset_i <- hTell handle
 108          let size = min wanted (fromIntegral $ size_i-offset_i)
 109          buf <- mallocForeignPtrArray (size+3)
 110          withForeignPtr buf $ \ptr ->
 111              do r <- if size == 0 then return 0 else hGetBuf handle ptr size
 112                 if r /= size
 113                    then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
 114                    else newUTF8StringBuffer buf ptr size
 115
 116 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
 117 newUTF8StringBuffer buf ptr size = do
 118   pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
 119   -- sentinels for UTF-8 decoding
 120   let
 121       sb0 = StringBuffer buf size 0
 122       (first_char, sb1) = nextChar sb0
 123         -- skip the byte-order mark if there is one (see #1744)
 124         -- This is better than treating #FEFF as whitespace,
 125         -- because that would mess up layout.  We don't have a concept
 126         -- of zero-width whitespace in Haskell: all whitespace codepoints
 127         -- have a width of one column.
 128   return (if first_char == '\xfeff' then sb1 else sb0)
 129
 130 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
 131 appendStringBuffers sb1 sb2
 132     = do newBuf <- mallocForeignPtrArray (size+3)
 133          withForeignPtr newBuf $ \ptr ->
 134           withForeignPtr (buf sb1) $ \sb1Ptr ->
 135            withForeignPtr (buf sb2) $ \sb2Ptr ->
 136              do copyArray (sb1Ptr `advancePtr` cur sb1) ptr (calcLen sb1)
 137                 copyArray (sb2Ptr `advancePtr` cur sb2) (ptr `advancePtr` cur sb1) (calcLen sb2)
 138                 pokeArray (ptr `advancePtr` size) [0,0,0]
 139                 return (StringBuffer newBuf size 0)
 140     where calcLen sb = len sb - cur sb
 141           size = calcLen sb1 + calcLen sb2
 142
 143 stringToStringBuffer :: String -> IO StringBuffer
 144 stringToStringBuffer str = do
 145   let size = utf8EncodedLength str
 146   buf <- mallocForeignPtrArray (size+3)
 147   withForeignPtr buf $ \ptr -> do
 148     utf8EncodeString ptr str
 149     pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
 150     -- sentinels for UTF-8 decoding
 151   return (StringBuffer buf size 0)
 152
 153 -- -----------------------------------------------------------------------------
 154 -- Grab a character
 155
 156 -- Getting our fingers dirty a little here, but this is performance-critical
 157 {-# INLINE nextChar #-}
 158 nextChar :: StringBuffer -> (Char,StringBuffer)
 159 nextChar (StringBuffer buf len (I# cur#)) =
 160   inlinePerformIO $ do
 161     withForeignPtr buf $ \(Ptr a#) -> do
 162         case utf8DecodeChar# (a# `plusAddr#` cur#) of
 163           (# c#, b# #) ->
 164              let cur' = I# (b# `minusAddr#` a#) in
 165              return (C# c#, StringBuffer buf len cur')
 166
 167 currentChar :: StringBuffer -> Char
 168 currentChar = fst . nextChar
 169
 170 prevChar :: StringBuffer -> Char -> Char
 171 prevChar (StringBuffer _   _   0)   deflt = deflt
 172 prevChar (StringBuffer buf _   cur) _     =
 173   inlinePerformIO $ do
 174     withForeignPtr buf $ \p -> do
 175       p' <- utf8PrevChar (p `plusPtr` cur)
 176       return (fst (utf8DecodeChar p'))
 177
 178 -- -----------------------------------------------------------------------------
 179 -- Moving
 180
 181 stepOn :: StringBuffer -> StringBuffer
 182 stepOn s = snd (nextChar s)
 183
 184 offsetBytes :: Int -> StringBuffer -> StringBuffer
 185 offsetBytes i s = s { cur = cur s + i }
 186
 187 byteDiff :: StringBuffer -> StringBuffer -> Int
 188 byteDiff s1 s2 = cur s2 - cur s1
 189
 190 atEnd :: StringBuffer -> Bool
 191 atEnd (StringBuffer _ l c) = l == c
 192
 193 -- -----------------------------------------------------------------------------
 194 -- Conversion
 195
 196 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
 197 lexemeToString _ 0 = ""
 198 lexemeToString (StringBuffer buf _ cur) bytes =
 199   inlinePerformIO $
 200     withForeignPtr buf $ \ptr ->
 201       utf8DecodeString (ptr `plusPtr` cur) bytes
 202
 203 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
 204 lexemeToFastString _ 0 = nilFS
 205 lexemeToFastString (StringBuffer buf _ cur) len =
 206    inlinePerformIO $
 207      withForeignPtr buf $ \ptr ->
 208        return $! mkFastStringBytes (ptr `plusPtr` cur) len
 209
 210 -- -----------------------------------------------------------------------------
 211 -- Parsing integer strings in various bases
 212 {-
 213 byteOff :: StringBuffer -> Int -> Char
 214 byteOff (StringBuffer buf _ cur) i =
 215   inlinePerformIO $ withForeignPtr buf $ \ptr -> do
 216 --    return $! cBox (indexWord8OffFastPtrAsFastChar
 217 --                         (pUnbox ptr) (iUnbox (cur+i)))
 218 --or
 219 --    w <- peek (ptr `plusPtr` (cur+i))
 220 --    return (unsafeChr (fromIntegral (w::Word8)))
 221 -}
 222 -- | XXX assumes ASCII digits only (by using byteOff)
 223 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 224 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
 225   = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
 226     --LOL, in implementations where the indexing needs slow unsafePerformIO,
 227     --this is less (not more) efficient than using the IO monad explicitly
 228     --here.
 229     ptr' = pUnbox ptr
 230     byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
 231     go i x | i == len  = x
 232            | otherwise = case byteOff i of
 233                char -> go (i + 1) (x * radix + toInteger (char_to_int char))
 234   in go 0 0
 235
 236 \end{code}