ghc/compiler/utils/StringBuffer.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1997-1998
   3 %
   4 \section{String buffers}
   5
   6 Buffers for scanning string input stored in external arrays.
   7
   8 \begin{code}
   9
  10 module StringBuffer
  11        (
  12         StringBuffer,
  13
  14          -- creation/destruction
  15         hGetStringBuffer,     -- :: FilePath     -> IO StringBuffer
  16         stringToStringBuffer, -- :: String       -> IO StringBuffer
  17         freeStringBuffer,     -- :: StringBuffer -> IO ()
  18
  19          -- Lookup
  20         currentChar,      -- :: StringBuffer -> Char
  21         currentChar#,     -- :: StringBuffer -> Char#
  22         indexSBuffer,     -- :: StringBuffer -> Int -> Char
  23         indexSBuffer#,    -- :: StringBuffer -> Int# -> Char#
  24          -- relative lookup, i.e, currentChar = lookAhead 0
  25         lookAhead,        -- :: StringBuffer -> Int  -> Char
  26         lookAhead#,       -- :: StringBuffer -> Int# -> Char#
  27
  28         -- offsets
  29         currentIndex#,    -- :: StringBuffer -> Int#
  30         lexemeIndex,      -- :: StringBuffer -> Int#
  31
  32          -- moving the end point of the current lexeme.
  33         setCurrentPos#,   -- :: StringBuffer -> Int# -> StringBuffer
  34         incLexeme,        -- :: StringBuffer -> StringBuffer
  35         decLexeme,        -- :: StringBuffer -> StringBuffer
  36
  37          -- move the start and end lexeme pointer on by x units.
  38         stepOn,           -- :: StringBuffer -> StringBuffer
  39         stepOnBy#,        -- :: StringBuffer -> Int# -> StringBuffer
  40         stepOnTo#,        -- :: StringBuffer -> Int# -> StringBuffer
  41         stepOnUntil,      -- :: (Char -> Bool) -> StringBuffer -> StringBuffer
  42         stepOnUntilChar#, -- :: StringBuffer -> Char# -> StringBuffer
  43         stepOverLexeme,   -- :: StringBuffer   -> StringBuffer
  44         scanNumLit,       -- :: Int -> StringBuffer -> (Int, StringBuffer)
  45         squeezeLexeme,    -- :: StringBuffer -> Int# -> StringBuffer
  46         mergeLexemes,     -- :: StringBuffer -> StringBuffer -> StringBuffer
  47         expandWhile,      -- :: (Char  -> Bool) -> StringBuffer -> StringBuffer
  48         expandWhile#,     -- :: (Char# -> Bool) -> StringBuffer -> StringBuffer
  49         expandUntilMatch, -- :: StrinBuffer -> String -> StringBuffer
  50          -- at or beyond end of buffer?
  51         bufferExhausted,  -- :: StringBuffer -> Bool
  52         emptyLexeme,      -- :: StringBuffer -> Bool
  53
  54          -- matching
  55         prefixMatch,       -- :: StringBuffer -> String -> Bool
  56         untilEndOfString#, -- :: StringBuffer -> Int#
  57
  58          -- conversion
  59         lexemeToString,     -- :: StringBuffer -> String
  60         lexemeToByteArray,  -- :: StringBuffer -> _ByteArray Int
  61         lexemeToFastString, -- :: StringBuffer -> FastString
  62         lexemeToBuffer,     -- :: StringBuffer -> StringBuffer
  63
  64         FastString,
  65         ByteArray
  66        ) where
  67
  68 #include "HsVersions.h"
  69
  70 import GlaExts
  71 #if __GLASGOW_HASKELL__ < 411
  72 import PrelAddr         ( Addr(..) )
  73 #else
  74 import Addr             ( Addr(..) )
  75 #endif
  76 import Foreign
  77 import Char             ( chr )
  78 import Panic            ( panic )
  79
  80 import IO               ( openFile  )
  81 import IOExts           ( slurpFile )
  82 import PrelIOBase
  83 import PrelHandle
  84 import Addr
  85 #if __GLASGOW_HASKELL__ >= 411
  86 import Ptr              ( Ptr(..) )
  87 #endif
  88
  89 import PrelPack         ( unpackCStringBA )
  90
  91 #if __GLASGOW_HASKELL__ >= 501
  92 import PrelIO           ( hGetcBuffered )
  93 import PrelCError       ( throwErrnoIfMinus1RetryMayBlock )
  94 import PrelConc         ( threadWaitRead )
  95 #endif
  96
  97 import Exception        ( bracket )
  98 import PrimPacked
  99 import FastString
 100 import Char             ( isDigit )
 101 \end{code}
 102
 103 \begin{code}
 104 data StringBuffer
 105  = StringBuffer
 106      Addr#
 107      Int#         -- length
 108      Int#         -- lexeme start
 109      Int#         -- current pos
 110 \end{code}
 111
 112 \begin{code}
 113 instance Show StringBuffer where
 114         showsPrec _ s = showString ""
 115 \end{code}
 116
 117 \begin{code}
 118 hGetStringBuffer :: Bool -> FilePath -> IO StringBuffer
 119 hGetStringBuffer expand_tabs fname = do
 120    (a, read) <- if expand_tabs
 121                                 then slurpFileExpandTabs fname
 122 #if __GLASGOW_HASKELL__ < 411
 123                                 else slurpFile fname
 124 #else
 125                                 else do
 126                                     (Ptr a#, read) <- slurpFile fname
 127                                     return (A# a#, read)
 128 #endif
 129
 130    let (A# a#) = a;  (I# read#) = read
 131
 132          -- add sentinel '\NUL'
 133    _casm_ `` ((char *)%0)[(int)%1]=(char)0; '' (A# a#) (I# (read# -# 1#))
 134    return (StringBuffer a# read# 0# 0#)
 135
 136 unsafeWriteBuffer :: StringBuffer -> Int# -> Char# -> StringBuffer
 137 unsafeWriteBuffer s@(StringBuffer a _ _ _) i# ch# =
 138  unsafePerformIO (
 139    _casm_ `` ((char *)%0)[(int)%1]=(char)%2; '' (A# a) (I# i#) (C# ch#) >>= \ () ->
 140    return s
 141  )
 142 \end{code}
 143
 144 -----------------------------------------------------------------------------
 145 -- Turn a String into a StringBuffer
 146
 147 \begin{code}
 148 stringToStringBuffer :: String -> IO StringBuffer
 149 freeStringBuffer :: StringBuffer -> IO ()
 150
 151 #if __GLASGOW_HASKELL__ >= 411
 152 stringToStringBuffer str =
 153   do let sz@(I# sz#) = length str
 154      (Ptr a#) <- mallocBytes (sz+1)
 155      fill_in str (A# a#)
 156      writeCharOffAddr (A# a#) sz '\0'           -- sentinel
 157      return (StringBuffer a# sz# 0# 0#)
 158  where
 159   fill_in [] _ = return ()
 160   fill_in (c:cs) a = do
 161     writeCharOffAddr a 0 c
 162     fill_in cs (a `plusAddr` 1)
 163
 164 freeStringBuffer (StringBuffer a# _ _ _) = Foreign.free (Ptr a#)
 165 #else
 166 stringToStringBuffer = panic "stringToStringBuffer: not implemented"
 167 freeStringBuffer sb  = return ()
 168 #endif
 169
 170 \end{code}
 171
 172 -----------------------------------------------------------------------------
 173 This very disturbing bit of code is used for expanding the tabs in a
 174 file before we start parsing it.  Expanding the tabs early makes the
 175 lexer a lot simpler: we only have to record the beginning of the line
 176 in order to be able to calculate the column offset of the current
 177 token.
 178
 179 We guess the size of the buffer required as 20% extra for
 180 expanded tabs, and enlarge it if necessary.
 181
 182 \begin{code}
 183 getErrType :: IO Int
 184 getErrType =  _ccall_ getErrType__
 185
 186 slurpFileExpandTabs :: FilePath -> IO (Addr,Int)
 187 slurpFileExpandTabs fname = do
 188   bracket (openFile fname ReadMode) (hClose)
 189    (\ handle ->
 190      do sz <- hFileSize handle
 191         if sz > toInteger (maxBound::Int)
 192           then IOERROR (userError "slurpFile: file too big")
 193           else do
 194             let sz_i = fromInteger sz
 195                 sz_i' = (sz_i * 12) `div` 10            -- add 20% for tabs
 196             chunk <- allocMem sz_i'
 197             trySlurp handle sz_i' chunk
 198    )
 199
 200 trySlurp :: Handle -> Int -> Addr -> IO (Addr, Int)
 201 trySlurp handle sz_i chunk =
 202 #if __GLASGOW_HASKELL__ < 501
 203   wantReadableHandle "hGetChar" handle $ \ handle_ ->
 204   let fo = haFO__ handle_ in
 205 #else
 206   wantReadableHandle "hGetChar" handle $
 207       \ handle_@Handle__{ haFD=fd, haBuffer=ref, haBufferMode=mode } ->
 208 #endif
 209   let
 210         (I# chunk_sz) = sz_i
 211
 212         tAB_SIZE = 8#
 213
 214         slurpFile :: Int# -> Int# -> Addr -> Int# -> Int# -> IO (Addr, Int)
 215         slurpFile c off chunk chunk_sz max_off = slurp c off
 216          where
 217
 218           slurp :: Int# -> Int# -> IO (Addr, Int)
 219           slurp c off | off >=# max_off = do
 220                 let new_sz = chunk_sz *# 2#
 221                 chunk' <- reAllocMem chunk (I# new_sz)
 222                 slurpFile c off chunk' new_sz (new_sz -# (tAB_SIZE +# 1#))
 223           slurp c off = do
 224 #if __GLASGOW_HASKELL__ < 501
 225                 intc <- mayBlock fo (_ccall_ fileGetc fo)
 226                 if intc == ((-1)::Int)
 227                   then do errtype <- getErrType
 228                           if errtype == (19{-ERR_EOF-} :: Int)
 229                             then return (chunk, I# off)
 230                             else constructErrorAndFail "slurpFile"
 231                   else case chr intc of
 232 #else
 233                 buf <- readIORef ref
 234                 ch <- (if not (bufferEmpty buf)
 235                       then hGetcBuffered fd ref buf
 236                       else do new_buf <- fillReadBuffer fd True buf
 237                               hGetcBuffered fd ref new_buf)
 238                     `catch` \e -> if isEOFError e
 239                         then return '\xFFFF'
 240                         else ioError e
 241                 case ch of
 242                          '\xFFFF' -> return (chunk, I# off)
 243 #endif
 244                          '\t' -> tabIt c off
 245                          ch   -> do  writeCharOffAddr chunk (I# off) ch
 246                                      let c' | ch == '\n' = 0#
 247                                             | otherwise  = c +# 1#
 248                                      slurp c' (off +# 1#)
 249
 250           tabIt :: Int# -> Int# -> IO (Addr, Int)
 251           -- can't run out of buffer in here, because we reserved an
 252           -- extra tAB_SIZE bytes at the end earlier.
 253           tabIt c off = do
 254                 writeCharOffAddr chunk (I# off) ' '
 255                 let c' = c +# 1#
 256                     off' = off +# 1#
 257                 if c' `remInt#` tAB_SIZE ==# 0#
 258                         then slurp c' off'
 259                         else tabIt c' off'
 260   in do
 261
 262         -- allow space for a full tab at the end of the buffer
 263         -- (that's what the max_off thing is for),
 264         -- and add 1 to allow room for the final sentinel \NUL at
 265         -- the end of the file.
 266   (chunk', rc) <- slurpFile 0# 0# chunk chunk_sz (chunk_sz -# (tAB_SIZE +# 1#))
 267 #if __GLASGOW_HASKELL__ < 404
 268   writeHandle handle handle_
 269 #endif
 270   return (chunk', rc+1 {-room for sentinel-})
 271
 272
 273 reAllocMem :: Addr -> Int -> IO Addr
 274 reAllocMem ptr sz = do
 275    chunk <- _ccall_ realloc ptr sz
 276    if chunk == nullAddr
 277       then fail "reAllocMem"
 278       else return chunk
 279
 280 allocMem :: Int -> IO Addr
 281 allocMem sz = do
 282    chunk <- _ccall_ malloc sz
 283    if chunk == nullAddr
 284 #if __GLASGOW_HASKELL__ < 501
 285       then constructErrorAndFail "allocMem"
 286 #else
 287       then ioException (IOError Nothing ResourceExhausted "malloc"
 288                                         "out of memory" Nothing)
 289 #endif
 290       else return chunk
 291 \end{code}
 292
 293 Lookup
 294
 295 \begin{code}
 296 currentChar  :: StringBuffer -> Char
 297 currentChar sb = case currentChar# sb of c -> C# c
 298
 299 lookAhead :: StringBuffer -> Int  -> Char
 300 lookAhead sb (I# i#) = case lookAhead# sb i# of c -> C# c
 301
 302 indexSBuffer :: StringBuffer -> Int -> Char
 303 indexSBuffer sb (I# i#) = case indexSBuffer# sb i# of c -> C# c
 304
 305 currentChar# :: StringBuffer -> Char#
 306 indexSBuffer# :: StringBuffer -> Int# -> Char#
 307 lookAhead# :: StringBuffer -> Int# -> Char#
 308 currentChar# (StringBuffer fo# _ _ current#) = indexCharOffAddr# fo# current#
 309 indexSBuffer# (StringBuffer fo# _ _ _) i# = indexCharOffAddr# fo# i#
 310
 311  -- relative lookup, i.e, currentChar = lookAhead 0
 312 lookAhead# (StringBuffer fo# _ _ c#) i# = indexCharOffAddr# fo# (c# +# i#)
 313
 314 currentIndex# :: StringBuffer -> Int#
 315 currentIndex# (StringBuffer fo# _ _ c#) = c#
 316
 317 lexemeIndex :: StringBuffer -> Int#
 318 lexemeIndex (StringBuffer fo# _ c# _) = c#
 319 \end{code}
 320
 321  moving the start point of the current lexeme.
 322
 323 \begin{code}
 324  -- moving the end point of the current lexeme.
 325 setCurrentPos# :: StringBuffer -> Int# -> StringBuffer
 326 setCurrentPos# (StringBuffer fo l# s# c#) i# =
 327  StringBuffer fo l# s# (c# +# i#)
 328
 329 -- augmenting the current lexeme by one.
 330 incLexeme :: StringBuffer -> StringBuffer
 331 incLexeme (StringBuffer fo l# s# c#) = StringBuffer fo l# s# (c# +# 1#)
 332
 333 decLexeme :: StringBuffer -> StringBuffer
 334 decLexeme (StringBuffer fo l# s# c#) = StringBuffer fo l# s# (c# -# 1#)
 335
 336 \end{code}
 337
 338 -- move the start and end point of the buffer on by
 339 -- x units.
 340
 341 \begin{code}
 342 stepOn :: StringBuffer -> StringBuffer
 343 stepOn (StringBuffer fo l# s# c#) = StringBuffer fo l# (s# +# 1#) (s# +# 1#) -- assume they're the same.
 344
 345 stepOnBy# :: StringBuffer -> Int# -> StringBuffer
 346 stepOnBy# (StringBuffer fo# l# s# c#) i# =
 347  case s# +# i# of
 348   new_s# -> StringBuffer fo# l# new_s# new_s#
 349
 350 -- jump to pos.
 351 stepOnTo# :: StringBuffer -> Int# -> StringBuffer
 352 stepOnTo# (StringBuffer fo l _ _) s# = StringBuffer fo l s# s#
 353
 354 squeezeLexeme :: StringBuffer -> Int# -> StringBuffer
 355 squeezeLexeme (StringBuffer fo l s# c#) i# = StringBuffer fo l (s# +# i#) c#
 356
 357 mergeLexemes :: StringBuffer -> StringBuffer -> StringBuffer
 358 mergeLexemes (StringBuffer fo l s# _) (StringBuffer _ _ _ c#)
 359    = StringBuffer fo l s# c#
 360
 361 stepOnUntil :: (Char -> Bool) -> StringBuffer -> StringBuffer
 362
 363 stepOnUntil pred (StringBuffer fo l# s# c#) =
 364  loop c#
 365   where
 366    loop c# =
 367     case indexCharOffAddr# fo c# of
 368      ch# | pred (C# ch#) -> StringBuffer fo l# c# c#
 369          | ch# `eqChar#` '\NUL'# && c# >=# l# -> StringBuffer fo l# l# l# -- EOB, return immediately.
 370          | otherwise     -> loop (c# +# 1#)
 371
 372 stepOverLexeme :: StringBuffer -> StringBuffer
 373 stepOverLexeme (StringBuffer fo l s# c#) = StringBuffer fo l c# c#
 374
 375 expandWhile :: (Char -> Bool) -> StringBuffer -> StringBuffer
 376 expandWhile pred (StringBuffer fo l# s# c#) =
 377  loop c#
 378   where
 379    loop c# =
 380     case indexCharOffAddr# fo c# of
 381      ch# | pred (C# ch#) -> loop (c# +# 1#)
 382          | ch# `eqChar#` '\NUL'# && c# >=# l# -> StringBuffer fo l# l# l# -- EOB, return immediately.
 383          | otherwise     -> StringBuffer fo l# s# c#
 384
 385 expandWhile# :: (Char# -> Bool) -> StringBuffer -> StringBuffer
 386 expandWhile# pred (StringBuffer fo l# s# c#) =
 387  loop c#
 388   where
 389    loop c# =
 390     case indexCharOffAddr# fo c# of
 391      ch# | pred ch# -> loop (c# +# 1#)
 392          | ch# `eqChar#` '\NUL'# && c# >=# l# -> StringBuffer fo l# s# c# -- EOB, return immediately.
 393          | otherwise     -> StringBuffer fo l# s# c#
 394
 395 scanNumLit :: Integer -> StringBuffer -> (Integer,StringBuffer)
 396 scanNumLit acc (StringBuffer fo l# s# c#) =
 397  loop acc c#
 398   where
 399    loop acc c# =
 400     case indexCharOffAddr# fo c# of
 401      ch# | isDigit (C# ch#) -> loop (acc*10 + (toInteger (I# (ord# ch# -# ord# '0'#)))) (c# +# 1#)
 402          | ch# `eqChar#` '\NUL'# && c# >=# l# -> (acc, StringBuffer fo l# s# c#) -- EOB, return immediately.
 403          | otherwise        -> (acc,StringBuffer fo l# s# c#)
 404
 405
 406 expandUntilMatch :: StringBuffer -> String -> Maybe StringBuffer
 407 expandUntilMatch (StringBuffer fo l# s# c#) str =
 408   loop c# str
 409   where
 410    loop c# [] = Just (StringBuffer fo l# s# c#)
 411    loop c# ((C# x#):xs) =
 412     case indexCharOffAddr# fo c# of
 413       ch# | ch# `eqChar#` '\NUL'# && c# >=# l# -> Nothing
 414           | ch# `eqChar#` x# -> loop (c# +# 1#) xs
 415           | otherwise        -> loop (c# +# 1#) str
 416
 417 \end{code}
 418
 419 \begin{code}
 420    -- at or beyond end of buffer?
 421 bufferExhausted :: StringBuffer -> Bool
 422 bufferExhausted (StringBuffer fo l# _ c#) = c# >=# l#
 423
 424 emptyLexeme :: StringBuffer -> Bool
 425 emptyLexeme (StringBuffer fo l# s# c#) = s# ==# c#
 426
 427  -- matching
 428 prefixMatch :: StringBuffer -> String -> Maybe StringBuffer
 429 prefixMatch (StringBuffer fo l# s# c#) str =
 430   loop c# str
 431   where
 432    loop c# [] = Just (StringBuffer fo l# s# c#)
 433    loop c# ((C# x#):xs)
 434      | indexCharOffAddr# fo c# `eqChar#` x#
 435      = loop (c# +# 1#) xs
 436      | otherwise
 437      = Nothing
 438
 439 untilEndOfString# :: StringBuffer -> StringBuffer
 440 untilEndOfString# (StringBuffer fo l# s# c#) =
 441  loop c#
 442  where
 443   getch# i# = indexCharOffAddr# fo i#
 444
 445   loop c# =
 446    case getch# c# of
 447     '\"'# ->
 448       case getch# (c# -# 1#) of
 449         '\\'# ->
 450                   -- looks like an escaped something or other to me,
 451                   -- better count the number of "\\"s that are immediately
 452                   -- preceeding to decide if the " is escaped.
 453               let
 454                odd_slashes flg i# =
 455                 case getch# i# of
 456                  '\\'# -> odd_slashes (not flg) (i# -# 1#)
 457                  _     -> flg
 458               in
 459               if odd_slashes True (c# -# 2#) then
 460                   -- odd number, " is ecaped.
 461                   loop (c# +# 1#)
 462               else  -- a real end of string delimiter after all.
 463                   StringBuffer fo l# s# c#
 464         _ -> StringBuffer fo l# s# c#
 465     '\NUL'# ->
 466         if c# >=# l# then -- hit sentinel, this doesn't look too good..
 467            StringBuffer fo l# l# l#
 468         else
 469            loop (c# +# 1#)
 470     _ -> loop (c# +# 1#)
 471
 472
 473 stepOnUntilChar# :: StringBuffer -> Char# -> StringBuffer
 474 stepOnUntilChar# (StringBuffer fo l# s# c#) x# =
 475  loop c#
 476  where
 477   loop c#
 478    | c# >=# l# || indexCharOffAddr# fo c# `eqChar#` x#
 479    = StringBuffer fo l# c# c#
 480    | otherwise
 481    = loop (c# +# 1#)
 482
 483          -- conversion
 484 lexemeToString :: StringBuffer -> String
 485 lexemeToString (StringBuffer fo _ start_pos# current#) =
 486  if start_pos# ==# current# then
 487     ""
 488  else
 489     unpackCStringBA (copySubStr (A# fo) (I# start_pos#) (I# (current# -# start_pos#)))
 490
 491 lexemeToByteArray :: StringBuffer -> ByteArray Int
 492 lexemeToByteArray (StringBuffer fo _ start_pos# current#) =
 493  if start_pos# ==# current# then
 494     error "lexemeToByteArray"
 495  else
 496     copySubStr (A# fo) (I# start_pos#) (I# (current# -# start_pos#))
 497
 498 lexemeToFastString :: StringBuffer -> FastString
 499 lexemeToFastString (StringBuffer fo l# start_pos# current#) =
 500  if start_pos# ==# current# then
 501     mkFastCharString2 (A# fo) (I# 0#)
 502  else
 503     mkFastSubString (A# fo) (I# start_pos#) (I# (current# -# start_pos#))
 504
 505 {-
 506  Create a StringBuffer from the current lexeme, and add a sentinel
 507  at the end. Know What You're Doing before taking this function
 508  into use..
 509 -}
 510 lexemeToBuffer :: StringBuffer -> StringBuffer
 511 lexemeToBuffer (StringBuffer fo l# start_pos# current#) =
 512  if start_pos# ==# current# then
 513     StringBuffer fo 0# start_pos# current# -- an error, really.
 514  else
 515     unsafeWriteBuffer (StringBuffer fo (current# -# start_pos#) start_pos# start_pos#)
 516                       (current# -# 1#)
 517                       '\NUL'#
 518
 519 \end{code}