compiler/utils/StringBuffer.lhs

   1 %
   2 % (c) The University of Glasgow 2006
   3 % (c) The University of Glasgow, 1997-2006
   4 %
   5
   6 Buffers for scanning string input stored in external arrays.
   7
   8 \begin{code}
   9 {-# OPTIONS -w #-}
  10 -- The above warning supression flag is a temporary kludge.
  11 -- While working on this module you are encouraged to remove it and fix
  12 -- any warnings in the module. See
  13 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  14 -- for details
  15
  16 module StringBuffer
  17        (
  18         StringBuffer(..),
  19         -- non-abstract for vs\/HaskellService
  20
  21          -- * Creation\/destruction
  22         hGetStringBuffer,
  23         hGetStringBufferBlock,
  24         appendStringBuffers,
  25         stringToStringBuffer,
  26
  27         -- * Inspection
  28         nextChar,
  29         currentChar,
  30         prevChar,
  31         atEnd,
  32
  33         -- * Moving and comparison
  34         stepOn,
  35         offsetBytes,
  36         byteDiff,
  37
  38         -- * Conversion
  39         lexemeToString,
  40         lexemeToFastString,
  41
  42          -- * Parsing integers
  43         parseUnsignedInteger,
  44        ) where
  45
  46 #include "HsVersions.h"
  47
  48 import Encoding
  49 import FastString               ( FastString,mkFastString,mkFastStringBytes )
  50 import FastTypes
  51 import FastFunctions
  52
  53 import Foreign
  54 import System.IO                ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
  55                                 , Handle, hTell )
  56
  57 import GHC.Exts
  58
  59 #if !defined(__GLASGOW_HASKELL__) || __GLASGOW_HASKELL__ >= 601
  60 import System.IO                ( openBinaryFile )
  61 #else
  62 import IOExts                   ( openFileEx, IOModeEx(..) )
  63 #endif
  64
  65 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ < 601
  66 openBinaryFile fp mode = openFileEx fp (BinaryMode mode)
  67 #endif
  68
  69 -- -----------------------------------------------------------------------------
  70 -- The StringBuffer type
  71
  72 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
  73 -- The bytes are intended to be *immutable*.  There are pure
  74 -- operations to read the contents of a StringBuffer.
  75 --
  76 -- A StringBuffer may have a finalizer, depending on how it was
  77 -- obtained.
  78 --
  79 data StringBuffer
  80  = StringBuffer {
  81      buf :: {-# UNPACK #-} !(ForeignPtr Word8),
  82      len :: {-# UNPACK #-} !Int,        -- length
  83      cur :: {-# UNPACK #-} !Int         -- current pos
  84   }
  85         -- The buffer is assumed to be UTF-8 encoded, and furthermore
  86         -- we add three '\0' bytes to the end as sentinels so that the
  87         -- decoder doesn't have to check for overflow at every single byte
  88         -- of a multibyte sequence.
  89
  90 instance Show StringBuffer where
  91         showsPrec _ s = showString "<stringbuffer("
  92                       . shows (len s) . showString "," . shows (cur s)
  93                       . showString ">"
  94
  95 -- -----------------------------------------------------------------------------
  96 -- Creation / Destruction
  97
  98 hGetStringBuffer :: FilePath -> IO StringBuffer
  99 hGetStringBuffer fname = do
 100    h <- openBinaryFile fname ReadMode
 101    size_i <- hFileSize h
 102    let size = fromIntegral size_i
 103    buf <- mallocForeignPtrArray (size+3)
 104    withForeignPtr buf $ \ptr -> do
 105      r <- if size == 0 then return 0 else hGetBuf h ptr size
 106      hClose h
 107      if (r /= size)
 108         then ioError (userError "short read of file")
 109         else newUTF8StringBuffer buf ptr size
 110
 111 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
 112 hGetStringBufferBlock handle wanted
 113     = do size_i <- hFileSize handle
 114          offset_i <- hTell handle
 115          let size = min wanted (fromIntegral $ size_i-offset_i)
 116          buf <- mallocForeignPtrArray (size+3)
 117          withForeignPtr buf $ \ptr ->
 118              do r <- if size == 0 then return 0 else hGetBuf handle ptr size
 119                 if r /= size
 120                    then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
 121                    else newUTF8StringBuffer buf ptr size
 122
 123 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
 124 newUTF8StringBuffer buf ptr size = do
 125   pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
 126          -- sentinels for UTF-8 decoding
 127   let
 128       sb0 = StringBuffer buf size 0
 129       (first_char, sb1) = nextChar sb0
 130         -- skip the byte-order mark if there is one (see #1744)
 131         -- This is better than treating #FEFF as whitespace,
 132         -- because that would mess up layout.  We don't have a concept
 133         -- of zero-width whitespace in Haskell: all whitespace codepoints
 134         -- have a width of one column.
 135   return (if first_char == '\xfeff' then sb1 else sb0)
 136
 137 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
 138 appendStringBuffers sb1 sb2
 139     = do newBuf <- mallocForeignPtrArray (size+3)
 140          withForeignPtr newBuf $ \ptr ->
 141           withForeignPtr (buf sb1) $ \sb1Ptr ->
 142            withForeignPtr (buf sb2) $ \sb2Ptr ->
 143              do copyArray (sb1Ptr `advancePtr` cur sb1) ptr (calcLen sb1)
 144                 copyArray (sb2Ptr `advancePtr` cur sb2) (ptr `advancePtr` cur sb1) (calcLen sb2)
 145                 pokeArray (ptr `advancePtr` size) [0,0,0]
 146                 return (StringBuffer newBuf size 0)
 147     where calcLen sb = len sb - cur sb
 148           size = calcLen sb1 + calcLen sb2
 149
 150 stringToStringBuffer :: String -> IO StringBuffer
 151 stringToStringBuffer str = do
 152   let size = utf8EncodedLength str
 153   buf <- mallocForeignPtrArray (size+3)
 154   withForeignPtr buf $ \ptr -> do
 155     utf8EncodeString ptr str
 156     pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
 157          -- sentinels for UTF-8 decoding
 158   return (StringBuffer buf size 0)
 159
 160 -- -----------------------------------------------------------------------------
 161 -- Grab a character
 162
 163 -- Getting our fingers dirty a little here, but this is performance-critical
 164 {-# INLINE nextChar #-}
 165 nextChar :: StringBuffer -> (Char,StringBuffer)
 166 nextChar (StringBuffer buf len (I# cur#)) =
 167   inlinePerformIO $ do
 168     withForeignPtr buf $ \(Ptr a#) -> do
 169         case utf8DecodeChar# (a# `plusAddr#` cur#) of
 170           (# c#, b# #) ->
 171              let cur' = I# (b# `minusAddr#` a#) in
 172              return (C# c#, StringBuffer buf len cur')
 173
 174 currentChar :: StringBuffer -> Char
 175 currentChar = fst . nextChar
 176
 177 prevChar :: StringBuffer -> Char -> Char
 178 prevChar (StringBuffer buf len 0)   deflt = deflt
 179 prevChar (StringBuffer buf len cur) deflt =
 180   inlinePerformIO $ do
 181     withForeignPtr buf $ \p -> do
 182       p' <- utf8PrevChar (p `plusPtr` cur)
 183       return (fst (utf8DecodeChar p'))
 184
 185 -- -----------------------------------------------------------------------------
 186 -- Moving
 187
 188 stepOn :: StringBuffer -> StringBuffer
 189 stepOn s = snd (nextChar s)
 190
 191 offsetBytes :: Int -> StringBuffer -> StringBuffer
 192 offsetBytes i s = s { cur = cur s + i }
 193
 194 byteDiff :: StringBuffer -> StringBuffer -> Int
 195 byteDiff s1 s2 = cur s2 - cur s1
 196
 197 atEnd :: StringBuffer -> Bool
 198 atEnd (StringBuffer _ l c) = l == c
 199
 200 -- -----------------------------------------------------------------------------
 201 -- Conversion
 202
 203 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
 204 lexemeToString _ 0 = ""
 205 lexemeToString (StringBuffer buf _ cur) bytes =
 206   inlinePerformIO $
 207     withForeignPtr buf $ \ptr ->
 208       utf8DecodeString (ptr `plusPtr` cur) bytes
 209
 210 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
 211 lexemeToFastString _ 0 = mkFastString ""
 212 lexemeToFastString (StringBuffer buf _ cur) len =
 213    inlinePerformIO $
 214      withForeignPtr buf $ \ptr ->
 215        return $! mkFastStringBytes (ptr `plusPtr` cur) len
 216
 217 -- -----------------------------------------------------------------------------
 218 -- Parsing integer strings in various bases
 219 {-
 220 byteOff :: StringBuffer -> Int -> Char
 221 byteOff (StringBuffer buf _ cur) i =
 222   inlinePerformIO $ withForeignPtr buf $ \ptr -> do
 223 --    return $! cBox (indexWord8OffFastPtrAsFastChar
 224 --                         (pUnbox ptr) (iUnbox (cur+i)))
 225 --or
 226 --    w <- peek (ptr `plusPtr` (cur+i))
 227 --    return (unsafeChr (fromIntegral (w::Word8)))
 228 -}
 229 -- | XXX assumes ASCII digits only (by using byteOff)
 230 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 231 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
 232   = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
 233     --LOL, in implementations where the indexing needs slow unsafePerformIO,
 234     --this is less (not more) efficient than using the IO monad explicitly
 235     --here.
 236     byteOff p i = cBox (indexWord8OffFastPtrAsFastChar
 237                          (pUnbox ptr) (iUnbox (cur+i)))
 238     go i x | i == len  = x
 239            | otherwise = case byteOff ptr i of
 240                char -> go (i+1) (x * radix + toInteger (char_to_int char))
 241   in go 0 0
 242
 243 \end{code}