2 % (c) The University of Glasgow, 1997-2006
6 FastString: A compact, hash-consed, representation of character strings.
7 Comparison is O(1), and you can get a Unique from them.
8 Generated by the FSLIT macro
9 Turn into SDoc with Outputable.ftext
11 LitString: Just a wrapper for the Addr# of a C string (Ptr CChar).
12 Practically no operations
13 Outputing them is fast
14 Generated by the SLIT macro
15 Turn into SDoc with Outputable.ptext
17 Use LitString unless you want the facilities of FastString
22 FastString(..), -- not abstract, for now.
28 mkFastStringForeignPtr,
29 #if defined(__GLASGOW_HASKELL__)
36 unpackFS, -- :: FastString -> String
37 bytesFS, -- :: FastString -> [Word8]
63 #if defined(__GLASGOW_HASKELL__)
74 #include "HsVersions.h"
85 import System.IO.Unsafe ( unsafePerformIO )
86 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
87 import Data.Maybe ( isJust )
88 #if !defined(__GLASGOW_HASKELL__)
89 import Data.Char ( ord )
92 import GHC.IOBase ( IO(..) )
93 import GHC.Ptr ( Ptr(..) )
95 #define hASH_TBL_SIZE 4091
96 #define hASH_TBL_SIZE_UNBOXED 4091#
100 A 'FastString' is an array of bytes, hashed to support fast O(1)
101 comparison. It is also associated with a character encoding, so that
102 we know how to convert a 'FastString' to the local encoding, or to the
103 Z-encoding used by the compiler internally.
105 'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
108 data FastString = FastString {
109 uniq :: {-# UNPACK #-} !Int, -- unique id
110 n_bytes :: {-# UNPACK #-} !Int, -- number of bytes
111 n_chars :: {-# UNPACK #-} !Int, -- number of chars
112 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
117 -- including strings that don't need any encoding
119 -- A UTF-8 string with a memoized Z-encoding
120 | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
122 instance Eq FastString where
123 f1 == f2 = uniq f1 == uniq f2
125 instance Ord FastString where
126 -- Compares lexicographically, not by unique
127 a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False }
128 a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False }
129 a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True }
130 a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True }
135 compare a b = cmpFS a b
137 instance Show FastString where
138 show fs = show (unpackFS fs)
140 cmpFS :: FastString -> FastString -> Ordering
141 cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
142 if u1 == u2 then EQ else
143 case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
148 unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
149 unsafeMemcmp buf1 buf2 l =
151 withForeignPtr buf1 $ \p1 ->
152 withForeignPtr buf2 $ \p2 ->
156 foreign import ccall unsafe "ghc_memcmp"
157 memcmp :: Ptr a -> Ptr b -> Int -> IO Int
160 -- -----------------------------------------------------------------------------
164 Internally, the compiler will maintain a fast string symbol
165 table, providing sharing and fast comparison. Creation of
166 new @FastString@s then covertly does a lookup, re-using the
167 @FastString@ if there was a hit.
170 data FastStringTable =
173 (MutableArray# RealWorld [FastString])
175 {-# NOINLINE string_table #-}
176 string_table :: IORef FastStringTable
179 tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
181 (# s2#, FastStringTable 0 arr# #)
184 lookupTbl :: FastStringTable -> Int -> IO [FastString]
185 lookupTbl (FastStringTable _ arr#) (I# i#) =
186 IO $ \ s# -> readArray# arr# i# s#
188 updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
189 updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
190 (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
191 writeIORef fs_table_var (FastStringTable (uid+1) arr#)
193 mkFastString# :: Addr# -> FastString
194 mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
197 mkFastStringBytes :: Ptr Word8 -> Int -> FastString
198 mkFastStringBytes ptr len = unsafePerformIO $ do
199 ft@(FastStringTable uid _) <- readIORef string_table
203 fs <- copyNewFastString uid ptr len
204 updTbl string_table ft h (fs:ls)
205 {- _trace ("new: " ++ show f_str) $ -}
208 lookup_result <- lookupTbl ft h
209 case lookup_result of
212 b <- bucket_match ls len ptr
215 Just v -> {- _trace ("re-use: "++show v) $ -} return v
217 mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
218 mkZFastStringBytes ptr len = unsafePerformIO $ do
219 ft@(FastStringTable uid _) <- readIORef string_table
223 fs <- copyNewZFastString uid ptr len
224 updTbl string_table ft h (fs:ls)
225 {- _trace ("new: " ++ show f_str) $ -}
228 lookup_result <- lookupTbl ft h
229 case lookup_result of
232 b <- bucket_match ls len ptr
235 Just v -> {- _trace ("re-use: "++show v) $ -} return v
237 -- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
238 -- between this and 'mkFastStringBytes' is that we don't have to copy
239 -- the bytes if the string is new to the table.
240 mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
241 mkFastStringForeignPtr ptr fp len = do
242 ft@(FastStringTable uid _) <- readIORef string_table
243 -- _trace ("hashed: "++show (I# h)) $
247 fs <- mkNewFastString uid ptr fp len
248 updTbl string_table ft h (fs:ls)
249 {- _trace ("new: " ++ show f_str) $ -}
252 lookup_result <- lookupTbl ft h
253 case lookup_result of
256 b <- bucket_match ls len ptr
259 Just v -> {- _trace ("re-use: "++show v) $ -} return v
261 mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
262 mkZFastStringForeignPtr ptr fp len = do
263 ft@(FastStringTable uid _) <- readIORef string_table
264 -- _trace ("hashed: "++show (I# h)) $
268 fs <- mkNewZFastString uid ptr fp len
269 updTbl string_table ft h (fs:ls)
270 {- _trace ("new: " ++ show f_str) $ -}
273 lookup_result <- lookupTbl ft h
274 case lookup_result of
277 b <- bucket_match ls len ptr
280 Just v -> {- _trace ("re-use: "++show v) $ -} return v
283 -- | Creates a UTF-8 encoded 'FastString' from a 'String'
284 mkFastString :: String -> FastString
287 let l = utf8EncodedLength str
288 buf <- mallocForeignPtrBytes l
289 withForeignPtr buf $ \ptr -> do
290 utf8EncodeString ptr str
291 mkFastStringForeignPtr ptr buf l
293 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
294 mkFastStringByteList :: [Word8] -> FastString
295 mkFastStringByteList str =
297 let l = Prelude.length str
298 buf <- mallocForeignPtrBytes l
299 withForeignPtr buf $ \ptr -> do
300 pokeArray (castPtr ptr) str
301 mkFastStringForeignPtr ptr buf l
303 -- | Creates a Z-encoded 'FastString' from a 'String'
304 mkZFastString :: String -> FastString
307 let l = Prelude.length str
308 buf <- mallocForeignPtrBytes l
309 withForeignPtr buf $ \ptr -> do
310 pokeCAString (castPtr ptr) str
311 mkZFastStringForeignPtr ptr buf l
313 bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
314 bucket_match [] _ _ = return Nothing
315 bucket_match (v@(FastString _ l _ buf _):ls) len ptr
317 b <- cmpStringPrefix ptr buf len
318 if b then return (Just v)
319 else bucket_match ls len ptr
321 bucket_match ls len ptr
323 mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
325 mkNewFastString uid ptr fp len = do
326 ref <- newIORef Nothing
327 n_chars <- countUTF8Chars ptr len
328 return (FastString uid len n_chars fp (UTF8Encoded ref))
330 mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
332 mkNewZFastString uid _ fp len = do
333 return (FastString uid len len fp ZEncoded)
335 copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString
336 copyNewFastString uid ptr len = do
337 fp <- copyBytesToForeignPtr ptr len
338 ref <- newIORef Nothing
339 n_chars <- countUTF8Chars ptr len
340 return (FastString uid len n_chars fp (UTF8Encoded ref))
342 copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString
343 copyNewZFastString uid ptr len = do
344 fp <- copyBytesToForeignPtr ptr len
345 return (FastString uid len len fp ZEncoded)
347 copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
348 copyBytesToForeignPtr ptr len = do
349 fp <- mallocForeignPtrBytes len
350 withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
353 cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
354 cmpStringPrefix ptr fp len =
355 withForeignPtr fp $ \ptr' -> do
356 r <- memcmp ptr ptr' len
360 hashStr :: Ptr Word8 -> Int -> Int
361 -- use the Addr to produce a hash value between 0 & m (inclusive)
362 hashStr (Ptr a#) (I# len#) = loop 0# 0#
364 loop h n | n GHC.Exts.==# len# = I# h
365 | otherwise = loop h2 (n GHC.Exts.+# 1#)
366 where c = ord# (indexCharOffAddr# a# n)
367 h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#`
370 -- -----------------------------------------------------------------------------
373 -- | Returns the length of the 'FastString' in characters
374 lengthFS :: FastString -> Int
375 lengthFS f = n_chars f
377 -- | Returns 'True' if the 'FastString' is Z-encoded
378 isZEncoded :: FastString -> Bool
379 isZEncoded fs | ZEncoded <- enc fs = True
382 -- | Returns 'True' if this 'FastString' is not Z-encoded but already has
383 -- a Z-encoding cached (used in producing stats).
384 hasZEncoding :: FastString -> Bool
385 hasZEncoding (FastString _ _ _ _ enc) =
393 -- | Returns 'True' if the 'FastString' is empty
394 nullFS :: FastString -> Bool
395 nullFS f = n_bytes f == 0
397 -- | unpacks and decodes the FastString
398 unpackFS :: FastString -> String
399 unpackFS (FastString _ n_bytes _ buf enc) =
400 inlinePerformIO $ withForeignPtr buf $ \ptr ->
402 ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes)
403 UTF8Encoded _ -> utf8DecodeString ptr n_bytes
405 bytesFS :: FastString -> [Word8]
406 bytesFS (FastString _ n_bytes _ buf _) =
407 inlinePerformIO $ withForeignPtr buf $ \ptr ->
408 peekArray n_bytes ptr
410 -- | returns a Z-encoded version of a 'FastString'. This might be the
411 -- original, if it was already Z-encoded. The first time this
412 -- function is applied to a particular 'FastString', the results are
415 zEncodeFS :: FastString -> FastString
416 zEncodeFS fs@(FastString _ _ _ _ enc) =
425 let efs = mkZFastString (zEncodeString (unpackFS fs))
426 writeIORef ref (Just efs)
429 appendFS :: FastString -> FastString -> FastString
430 appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)
432 concatFS :: [FastString] -> FastString
433 concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better
435 headFS :: FastString -> Char
436 headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString"
437 headFS (FastString _ _ _ buf enc) =
438 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
441 w <- peek (castPtr ptr)
442 return (castCCharToChar w)
444 return (fst (utf8DecodeChar ptr))
446 tailFS :: FastString -> FastString
447 tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString"
448 tailFS (FastString _ n_bytes _ buf enc) =
449 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
452 return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
454 let (_,ptr') = utf8DecodeChar ptr
455 let off = ptr' `minusPtr` ptr
456 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)
458 consFS :: Char -> FastString -> FastString
459 consFS c fs = mkFastString (c : unpackFS fs)
461 uniqueOfFS :: FastString -> FastInt
462 uniqueOfFS (FastString u _ _ _ _) = iUnbox u
465 nilFS = mkFastString ""
467 -- -----------------------------------------------------------------------------
470 getFastStringTable :: IO [[FastString]]
471 getFastStringTable = do
472 tbl <- readIORef string_table
473 buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
476 -- -----------------------------------------------------------------------------
477 -- Outputting 'FastString's
479 -- |Outputs a 'FastString' with /no decoding at all/, that is, you
480 -- get the actual bytes in the 'FastString' written to the 'Handle'.
481 hPutFS :: Handle -> FastString -> IO ()
482 hPutFS handle (FastString _ len _ fp _)
483 | len == 0 = return ()
484 | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
486 -- ToDo: we'll probably want an hPutFSLocal, or something, to output
487 -- in the current locale's encoding (for error messages and suchlike).
489 -- -----------------------------------------------------------------------------
490 -- LitStrings, here for convenience only.
492 -- hmm, not unboxed (or rather FastPtr), interesting
493 --a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph. We don't
494 --really care about C types in naming, where we can help it.
495 type LitString = Ptr Word8
496 --Why do we recalculate length every time it's requested?
497 --If it's commonly needed, we should perhaps have
498 --data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt
500 #if defined(__GLASGOW_HASKELL__)
501 mkLitString# :: Addr# -> LitString
502 mkLitString# a# = Ptr a#
504 --can/should we use FastTypes here?
505 --Is this likely to be memory-preserving if only used on constant strings?
506 --should we inline it? If lucky, that would make a CAF that wouldn't
507 --be computationally repeated... although admittedly we're not
508 --really intending to use mkLitString when __GLASGOW_HASKELL__...
509 --(I wonder, is unicode / multi-byte characters allowed in LitStrings
511 {-# INLINE mkLitString #-}
512 mkLitString :: String -> LitString
515 p <- mallocBytes (length s + 1)
517 loop :: Int -> String -> IO ()
518 loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
520 pokeByteOff p n (fromIntegral (ord c) :: Word8)
527 unpackLitString :: LitString -> String
528 unpackLitString p_ = case pUnbox p_ of
529 p -> unpack (_ILIT(0))
531 unpack n = case indexWord8OffFastPtrAsFastChar p n of
532 ch -> if ch `eqFastChar` _CLIT('\0')
533 then [] else cBox ch : unpack (n +# _ILIT(1))
535 strLength :: LitString -> Int
536 strLength = ptrStrLength
538 -- for now, use a simple String representation
539 --no, let's not do that right now - it's work in other places
541 type LitString = String
543 mkLitString :: String -> LitString
546 unpackLitString :: LitString -> String
549 strLength :: LitString -> Int
554 -- -----------------------------------------------------------------------------
557 foreign import ccall unsafe "ghc_strlen"
558 ptrStrLength :: Ptr Word8 -> Int
560 -- NB. does *not* add a '\0'-terminator.
561 -- We only use CChar here to be parallel to the imported
562 -- peekC(A)StringLen.
563 pokeCAString :: Ptr CChar -> String -> IO ()
564 pokeCAString ptr str =
567 go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
571 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 602
572 peekCAStringLen = peekCStringLen