2 % (c) The University of Glasgow, 1997-2006
6 FastString: A compact, hash-consed, representation of character strings.
7 Comparison is O(1), and you can get a Unique from them.
9 Turn into SDoc with Outputable.ftext
11 LitString: Just a wrapper for the Addr# of a C string (Ptr CChar).
12 Practically no operations
13 Outputing them is fast
15 Turn into SDoc with Outputable.ptext
17 Use LitString unless you want the facilities of FastString
22 FastString(..), -- not abstract, for now.
28 mkFastStringForeignPtr,
29 #if defined(__GLASGOW_HASKELL__)
36 unpackFS, -- :: FastString -> String
37 bytesFS, -- :: FastString -> [Word8]
63 #if defined(__GLASGOW_HASKELL__)
76 #include "HsVersions.h"
87 import System.IO.Unsafe ( unsafePerformIO )
88 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
89 import Data.Maybe ( isJust )
90 import Data.Char ( ord )
92 import GHC.IOBase ( IO(..) )
93 import GHC.Ptr ( Ptr(..) )
94 #if defined(__GLASGOW_HASKELL__)
95 import GHC.Base ( unpackCString# )
98 #define hASH_TBL_SIZE 4091
99 #define hASH_TBL_SIZE_UNBOXED 4091#
103 A 'FastString' is an array of bytes, hashed to support fast O(1)
104 comparison. It is also associated with a character encoding, so that
105 we know how to convert a 'FastString' to the local encoding, or to the
106 Z-encoding used by the compiler internally.
108 'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
111 data FastString = FastString {
112 uniq :: {-# UNPACK #-} !Int, -- unique id
113 n_bytes :: {-# UNPACK #-} !Int, -- number of bytes
114 n_chars :: {-# UNPACK #-} !Int, -- number of chars
115 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
120 -- including strings that don't need any encoding
122 -- A UTF-8 string with a memoized Z-encoding
123 | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
125 instance Eq FastString where
126 f1 == f2 = uniq f1 == uniq f2
128 instance Ord FastString where
129 -- Compares lexicographically, not by unique
130 a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False }
131 a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False }
132 a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True }
133 a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True }
138 compare a b = cmpFS a b
140 instance Show FastString where
141 show fs = show (unpackFS fs)
143 cmpFS :: FastString -> FastString -> Ordering
144 cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
145 if u1 == u2 then EQ else
146 case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
151 unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
152 unsafeMemcmp buf1 buf2 l =
154 withForeignPtr buf1 $ \p1 ->
155 withForeignPtr buf2 $ \p2 ->
159 foreign import ccall unsafe "ghc_memcmp"
160 memcmp :: Ptr a -> Ptr b -> Int -> IO Int
163 -- -----------------------------------------------------------------------------
167 Internally, the compiler will maintain a fast string symbol
168 table, providing sharing and fast comparison. Creation of
169 new @FastString@s then covertly does a lookup, re-using the
170 @FastString@ if there was a hit.
173 data FastStringTable =
176 (MutableArray# RealWorld [FastString])
178 {-# NOINLINE string_table #-}
179 string_table :: IORef FastStringTable
182 tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
184 (# s2#, FastStringTable 0 arr# #)
187 lookupTbl :: FastStringTable -> Int -> IO [FastString]
188 lookupTbl (FastStringTable _ arr#) (I# i#) =
189 IO $ \ s# -> readArray# arr# i# s#
191 updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
192 updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
193 (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
194 writeIORef fs_table_var (FastStringTable (uid+1) arr#)
196 mkFastString# :: Addr# -> FastString
197 mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
200 mkFastStringBytes :: Ptr Word8 -> Int -> FastString
201 mkFastStringBytes ptr len = unsafePerformIO $ do
202 ft@(FastStringTable uid _) <- readIORef string_table
206 fs <- copyNewFastString uid ptr len
207 updTbl string_table ft h (fs:ls)
208 {- _trace ("new: " ++ show f_str) $ -}
211 lookup_result <- lookupTbl ft h
212 case lookup_result of
215 b <- bucket_match ls len ptr
218 Just v -> {- _trace ("re-use: "++show v) $ -} return v
220 mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
221 mkZFastStringBytes ptr len = unsafePerformIO $ do
222 ft@(FastStringTable uid _) <- readIORef string_table
226 fs <- copyNewZFastString uid ptr len
227 updTbl string_table ft h (fs:ls)
228 {- _trace ("new: " ++ show f_str) $ -}
231 lookup_result <- lookupTbl ft h
232 case lookup_result of
235 b <- bucket_match ls len ptr
238 Just v -> {- _trace ("re-use: "++show v) $ -} return v
240 -- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
241 -- between this and 'mkFastStringBytes' is that we don't have to copy
242 -- the bytes if the string is new to the table.
243 mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
244 mkFastStringForeignPtr ptr fp len = do
245 ft@(FastStringTable uid _) <- readIORef string_table
246 -- _trace ("hashed: "++show (I# h)) $
250 fs <- mkNewFastString uid ptr fp len
251 updTbl string_table ft h (fs:ls)
252 {- _trace ("new: " ++ show f_str) $ -}
255 lookup_result <- lookupTbl ft h
256 case lookup_result of
259 b <- bucket_match ls len ptr
262 Just v -> {- _trace ("re-use: "++show v) $ -} return v
264 mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
265 mkZFastStringForeignPtr ptr fp len = do
266 ft@(FastStringTable uid _) <- readIORef string_table
267 -- _trace ("hashed: "++show (I# h)) $
271 fs <- mkNewZFastString uid ptr fp len
272 updTbl string_table ft h (fs:ls)
273 {- _trace ("new: " ++ show f_str) $ -}
276 lookup_result <- lookupTbl ft h
277 case lookup_result of
280 b <- bucket_match ls len ptr
283 Just v -> {- _trace ("re-use: "++show v) $ -} return v
286 -- | Creates a UTF-8 encoded 'FastString' from a 'String'
287 mkFastString :: String -> FastString
290 let l = utf8EncodedLength str
291 buf <- mallocForeignPtrBytes l
292 withForeignPtr buf $ \ptr -> do
293 utf8EncodeString ptr str
294 mkFastStringForeignPtr ptr buf l
296 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
297 mkFastStringByteList :: [Word8] -> FastString
298 mkFastStringByteList str =
300 let l = Prelude.length str
301 buf <- mallocForeignPtrBytes l
302 withForeignPtr buf $ \ptr -> do
303 pokeArray (castPtr ptr) str
304 mkFastStringForeignPtr ptr buf l
306 -- | Creates a Z-encoded 'FastString' from a 'String'
307 mkZFastString :: String -> FastString
310 let l = Prelude.length str
311 buf <- mallocForeignPtrBytes l
312 withForeignPtr buf $ \ptr -> do
313 pokeCAString (castPtr ptr) str
314 mkZFastStringForeignPtr ptr buf l
316 bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
317 bucket_match [] _ _ = return Nothing
318 bucket_match (v@(FastString _ l _ buf _):ls) len ptr
320 b <- cmpStringPrefix ptr buf len
321 if b then return (Just v)
322 else bucket_match ls len ptr
324 bucket_match ls len ptr
326 mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
328 mkNewFastString uid ptr fp len = do
329 ref <- newIORef Nothing
330 n_chars <- countUTF8Chars ptr len
331 return (FastString uid len n_chars fp (UTF8Encoded ref))
333 mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
335 mkNewZFastString uid _ fp len = do
336 return (FastString uid len len fp ZEncoded)
338 copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString
339 copyNewFastString uid ptr len = do
340 fp <- copyBytesToForeignPtr ptr len
341 ref <- newIORef Nothing
342 n_chars <- countUTF8Chars ptr len
343 return (FastString uid len n_chars fp (UTF8Encoded ref))
345 copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString
346 copyNewZFastString uid ptr len = do
347 fp <- copyBytesToForeignPtr ptr len
348 return (FastString uid len len fp ZEncoded)
350 copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
351 copyBytesToForeignPtr ptr len = do
352 fp <- mallocForeignPtrBytes len
353 withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
356 cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
357 cmpStringPrefix ptr fp len =
358 withForeignPtr fp $ \ptr' -> do
359 r <- memcmp ptr ptr' len
363 hashStr :: Ptr Word8 -> Int -> Int
364 -- use the Addr to produce a hash value between 0 & m (inclusive)
365 hashStr (Ptr a#) (I# len#) = loop 0# 0#
367 loop h n | n GHC.Exts.==# len# = I# h
368 | otherwise = loop h2 (n GHC.Exts.+# 1#)
369 where c = ord# (indexCharOffAddr# a# n)
370 h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#`
373 -- -----------------------------------------------------------------------------
376 -- | Returns the length of the 'FastString' in characters
377 lengthFS :: FastString -> Int
378 lengthFS f = n_chars f
380 -- | Returns 'True' if the 'FastString' is Z-encoded
381 isZEncoded :: FastString -> Bool
382 isZEncoded fs | ZEncoded <- enc fs = True
385 -- | Returns 'True' if this 'FastString' is not Z-encoded but already has
386 -- a Z-encoding cached (used in producing stats).
387 hasZEncoding :: FastString -> Bool
388 hasZEncoding (FastString _ _ _ _ enc) =
396 -- | Returns 'True' if the 'FastString' is empty
397 nullFS :: FastString -> Bool
398 nullFS f = n_bytes f == 0
400 -- | unpacks and decodes the FastString
401 unpackFS :: FastString -> String
402 unpackFS (FastString _ n_bytes _ buf enc) =
403 inlinePerformIO $ withForeignPtr buf $ \ptr ->
405 ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes)
406 UTF8Encoded _ -> utf8DecodeString ptr n_bytes
408 bytesFS :: FastString -> [Word8]
409 bytesFS (FastString _ n_bytes _ buf _) =
410 inlinePerformIO $ withForeignPtr buf $ \ptr ->
411 peekArray n_bytes ptr
413 -- | returns a Z-encoded version of a 'FastString'. This might be the
414 -- original, if it was already Z-encoded. The first time this
415 -- function is applied to a particular 'FastString', the results are
418 zEncodeFS :: FastString -> FastString
419 zEncodeFS fs@(FastString _ _ _ _ enc) =
428 let efs = mkZFastString (zEncodeString (unpackFS fs))
429 writeIORef ref (Just efs)
432 appendFS :: FastString -> FastString -> FastString
433 appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)
435 concatFS :: [FastString] -> FastString
436 concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better
438 headFS :: FastString -> Char
439 headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString"
440 headFS (FastString _ _ _ buf enc) =
441 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
444 w <- peek (castPtr ptr)
445 return (castCCharToChar w)
447 return (fst (utf8DecodeChar ptr))
449 tailFS :: FastString -> FastString
450 tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString"
451 tailFS (FastString _ n_bytes _ buf enc) =
452 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
455 return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
457 let (_,ptr') = utf8DecodeChar ptr
458 let off = ptr' `minusPtr` ptr
459 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)
461 consFS :: Char -> FastString -> FastString
462 consFS c fs = mkFastString (c : unpackFS fs)
464 uniqueOfFS :: FastString -> FastInt
465 uniqueOfFS (FastString u _ _ _ _) = iUnbox u
468 nilFS = mkFastString ""
470 -- -----------------------------------------------------------------------------
473 getFastStringTable :: IO [[FastString]]
474 getFastStringTable = do
475 tbl <- readIORef string_table
476 buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
479 -- -----------------------------------------------------------------------------
480 -- Outputting 'FastString's
482 -- |Outputs a 'FastString' with /no decoding at all/, that is, you
483 -- get the actual bytes in the 'FastString' written to the 'Handle'.
484 hPutFS :: Handle -> FastString -> IO ()
485 hPutFS handle (FastString _ len _ fp _)
486 | len == 0 = return ()
487 | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
489 -- ToDo: we'll probably want an hPutFSLocal, or something, to output
490 -- in the current locale's encoding (for error messages and suchlike).
492 -- -----------------------------------------------------------------------------
493 -- LitStrings, here for convenience only.
495 -- hmm, not unboxed (or rather FastPtr), interesting
496 --a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph. We don't
497 --really care about C types in naming, where we can help it.
498 type LitString = Ptr Word8
499 --Why do we recalculate length every time it's requested?
500 --If it's commonly needed, we should perhaps have
501 --data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt
503 #if defined(__GLASGOW_HASKELL__)
504 mkLitString# :: Addr# -> LitString
505 mkLitString# a# = Ptr a#
507 --can/should we use FastTypes here?
508 --Is this likely to be memory-preserving if only used on constant strings?
509 --should we inline it? If lucky, that would make a CAF that wouldn't
510 --be computationally repeated... although admittedly we're not
511 --really intending to use mkLitString when __GLASGOW_HASKELL__...
512 --(I wonder, is unicode / multi-byte characters allowed in LitStrings
514 {-# INLINE mkLitString #-}
515 mkLitString :: String -> LitString
518 p <- mallocBytes (length s + 1)
520 loop :: Int -> String -> IO ()
521 loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
523 pokeByteOff p n (fromIntegral (ord c) :: Word8)
525 -- XXX GHC isn't smart enough to know that we have already covered
527 loop _ [] = panic "mkLitString"
532 unpackLitString :: LitString -> String
533 unpackLitString p_ = case pUnbox p_ of
534 p -> unpack (_ILIT(0))
536 unpack n = case indexWord8OffFastPtrAsFastChar p n of
537 ch -> if ch `eqFastChar` _CLIT('\0')
538 then [] else cBox ch : unpack (n +# _ILIT(1))
540 strLength :: LitString -> Int
541 strLength = ptrStrLength
543 -- for now, use a simple String representation
544 --no, let's not do that right now - it's work in other places
546 type LitString = String
548 mkLitString :: String -> LitString
551 unpackLitString :: LitString -> String
554 strLength :: LitString -> Int
559 -- -----------------------------------------------------------------------------
562 foreign import ccall unsafe "ghc_strlen"
563 ptrStrLength :: Ptr Word8 -> Int
565 -- NB. does *not* add a '\0'-terminator.
566 -- We only use CChar here to be parallel to the imported
567 -- peekC(A)StringLen.
568 pokeCAString :: Ptr CChar -> String -> IO ()
569 pokeCAString ptr str =
572 go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
576 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 602
577 peekCAStringLen = peekCStringLen
580 {-# NOINLINE sLit #-}
581 sLit :: String -> LitString
582 sLit x = mkLitString x
584 {-# NOINLINE fsLit #-}
585 fsLit :: String -> FastString
586 fsLit x = mkFastString x
589 forall x . sLit (unpackCString# x) = mkLitString# x #-}
591 forall x . fsLit (unpackCString# x) = mkFastString# x #-}