compiler/utils/StringBuffer.lhs

   1 %
   2 % (c) The University of Glasgow 2006
   3 % (c) The University of Glasgow, 1997-2006
   4 %
   5
   6 Buffers for scanning string input stored in external arrays.
   7
   8 \begin{code}
   9 {-# OPTIONS_GHC -O -funbox-strict-fields #-}
  10 -- We always optimise this, otherwise performance of a non-optimised
  11 -- compiler is severely affected
  12
  13 module StringBuffer
  14        (
  15         StringBuffer(..),
  16         -- non-abstract for vs\/HaskellService
  17
  18          -- * Creation\/destruction
  19         hGetStringBuffer,
  20         hGetStringBufferBlock,
  21         appendStringBuffers,
  22         stringToStringBuffer,
  23
  24         -- * Inspection
  25         nextChar,
  26         currentChar,
  27         prevChar,
  28         atEnd,
  29
  30         -- * Moving and comparison
  31         stepOn,
  32         offsetBytes,
  33         byteDiff,
  34
  35         -- * Conversion
  36         lexemeToString,
  37         lexemeToFastString,
  38
  39          -- * Parsing integers
  40         parseUnsignedInteger,
  41        ) where
  42
  43 #include "HsVersions.h"
  44
  45 import Encoding
  46 import FastString hiding ( buf )
  47 import FastTypes
  48 import FastFunctions
  49
  50 import Foreign
  51 import System.IO                ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
  52                                 , Handle, hTell )
  53
  54 import GHC.Exts
  55
  56 import System.IO                ( openBinaryFile )
  57
  58 -- -----------------------------------------------------------------------------
  59 -- The StringBuffer type
  60
  61 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
  62 -- The bytes are intended to be *immutable*.  There are pure
  63 -- operations to read the contents of a StringBuffer.
  64 --
  65 -- A StringBuffer may have a finalizer, depending on how it was
  66 -- obtained.
  67 --
  68 data StringBuffer
  69  = StringBuffer {
  70      buf :: {-# UNPACK #-} !(ForeignPtr Word8),
  71      len :: {-# UNPACK #-} !Int,        -- length
  72      cur :: {-# UNPACK #-} !Int         -- current pos
  73   }
  74   -- The buffer is assumed to be UTF-8 encoded, and furthermore
  75   -- we add three '\0' bytes to the end as sentinels so that the
  76   -- decoder doesn't have to check for overflow at every single byte
  77   -- of a multibyte sequence.
  78
  79 instance Show StringBuffer where
  80         showsPrec _ s = showString "<stringbuffer("
  81                       . shows (len s) . showString "," . shows (cur s)
  82                       . showString ")>"
  83
  84 -- -----------------------------------------------------------------------------
  85 -- Creation / Destruction
  86
  87 hGetStringBuffer :: FilePath -> IO StringBuffer
  88 hGetStringBuffer fname = do
  89    h <- openBinaryFile fname ReadMode
  90    size_i <- hFileSize h
  91    let size = fromIntegral size_i
  92    buf <- mallocForeignPtrArray (size+3)
  93    withForeignPtr buf $ \ptr -> do
  94      r <- if size == 0 then return 0 else hGetBuf h ptr size
  95      hClose h
  96      if (r /= size)
  97         then ioError (userError "short read of file")
  98         else newUTF8StringBuffer buf ptr size
  99
 100 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
 101 hGetStringBufferBlock handle wanted
 102     = do size_i <- hFileSize handle
 103          offset_i <- hTell handle
 104          let size = min wanted (fromIntegral $ size_i-offset_i)
 105          buf <- mallocForeignPtrArray (size+3)
 106          withForeignPtr buf $ \ptr ->
 107              do r <- if size == 0 then return 0 else hGetBuf handle ptr size
 108                 if r /= size
 109                    then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
 110                    else newUTF8StringBuffer buf ptr size
 111
 112 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
 113 newUTF8StringBuffer buf ptr size = do
 114   pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
 115   -- sentinels for UTF-8 decoding
 116   let
 117       sb0 = StringBuffer buf size 0
 118       (first_char, sb1) = nextChar sb0
 119         -- skip the byte-order mark if there is one (see #1744)
 120         -- This is better than treating #FEFF as whitespace,
 121         -- because that would mess up layout.  We don't have a concept
 122         -- of zero-width whitespace in Haskell: all whitespace codepoints
 123         -- have a width of one column.
 124   return (if first_char == '\xfeff' then sb1 else sb0)
 125
 126 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
 127 appendStringBuffers sb1 sb2
 128     = do newBuf <- mallocForeignPtrArray (size+3)
 129          withForeignPtr newBuf $ \ptr ->
 130           withForeignPtr (buf sb1) $ \sb1Ptr ->
 131            withForeignPtr (buf sb2) $ \sb2Ptr ->
 132              do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
 133                 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
 134                 pokeArray (ptr `advancePtr` size) [0,0,0]
 135                 return (StringBuffer newBuf size 0)
 136     where sb1_len = calcLen sb1
 137           sb2_len = calcLen sb2
 138           calcLen sb = len sb - cur sb
 139           size =  sb1_len + sb2_len
 140
 141 stringToStringBuffer :: String -> IO StringBuffer
 142 stringToStringBuffer str = do
 143   let size = utf8EncodedLength str
 144   buf <- mallocForeignPtrArray (size+3)
 145   withForeignPtr buf $ \ptr -> do
 146     utf8EncodeString ptr str
 147     pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
 148     -- sentinels for UTF-8 decoding
 149   return (StringBuffer buf size 0)
 150
 151 -- -----------------------------------------------------------------------------
 152 -- Grab a character
 153
 154 -- Getting our fingers dirty a little here, but this is performance-critical
 155 {-# INLINE nextChar #-}
 156 nextChar :: StringBuffer -> (Char,StringBuffer)
 157 nextChar (StringBuffer buf len (I# cur#)) =
 158   inlinePerformIO $ do
 159     withForeignPtr buf $ \(Ptr a#) -> do
 160         case utf8DecodeChar# (a# `plusAddr#` cur#) of
 161           (# c#, b# #) ->
 162              let cur' = I# (b# `minusAddr#` a#) in
 163              return (C# c#, StringBuffer buf len cur')
 164
 165 currentChar :: StringBuffer -> Char
 166 currentChar = fst . nextChar
 167
 168 prevChar :: StringBuffer -> Char -> Char
 169 prevChar (StringBuffer _   _   0)   deflt = deflt
 170 prevChar (StringBuffer buf _   cur) _     =
 171   inlinePerformIO $ do
 172     withForeignPtr buf $ \p -> do
 173       p' <- utf8PrevChar (p `plusPtr` cur)
 174       return (fst (utf8DecodeChar p'))
 175
 176 -- -----------------------------------------------------------------------------
 177 -- Moving
 178
 179 stepOn :: StringBuffer -> StringBuffer
 180 stepOn s = snd (nextChar s)
 181
 182 offsetBytes :: Int -> StringBuffer -> StringBuffer
 183 offsetBytes i s = s { cur = cur s + i }
 184
 185 byteDiff :: StringBuffer -> StringBuffer -> Int
 186 byteDiff s1 s2 = cur s2 - cur s1
 187
 188 atEnd :: StringBuffer -> Bool
 189 atEnd (StringBuffer _ l c) = l == c
 190
 191 -- -----------------------------------------------------------------------------
 192 -- Conversion
 193
 194 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
 195 lexemeToString _ 0 = ""
 196 lexemeToString (StringBuffer buf _ cur) bytes =
 197   inlinePerformIO $
 198     withForeignPtr buf $ \ptr ->
 199       utf8DecodeString (ptr `plusPtr` cur) bytes
 200
 201 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
 202 lexemeToFastString _ 0 = nilFS
 203 lexemeToFastString (StringBuffer buf _ cur) len =
 204    inlinePerformIO $
 205      withForeignPtr buf $ \ptr ->
 206        return $! mkFastStringBytes (ptr `plusPtr` cur) len
 207
 208 -- -----------------------------------------------------------------------------
 209 -- Parsing integer strings in various bases
 210 {-
 211 byteOff :: StringBuffer -> Int -> Char
 212 byteOff (StringBuffer buf _ cur) i =
 213   inlinePerformIO $ withForeignPtr buf $ \ptr -> do
 214 --    return $! cBox (indexWord8OffFastPtrAsFastChar
 215 --                         (pUnbox ptr) (iUnbox (cur+i)))
 216 --or
 217 --    w <- peek (ptr `plusPtr` (cur+i))
 218 --    return (unsafeChr (fromIntegral (w::Word8)))
 219 -}
 220 -- | XXX assumes ASCII digits only (by using byteOff)
 221 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 222 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
 223   = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
 224     --LOL, in implementations where the indexing needs slow unsafePerformIO,
 225     --this is less (not more) efficient than using the IO monad explicitly
 226     --here.
 227     !ptr' = pUnbox ptr
 228     byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
 229     go i x | i == len  = x
 230            | otherwise = case byteOff i of
 231                char -> go (i + 1) (x * radix + toInteger (char_to_int char))
 232   in go 0 0
 233
 234 \end{code}