2 % (c) The University of Glasgow, 1997-2006
5 {-# OPTIONS -fno-warn-unused-imports #-}
6 -- XXX GHC 6.9 seems to be confused by unpackCString# being used only in
9 {-# OPTIONS_GHC -O -funbox-strict-fields #-}
10 -- We always optimise this, otherwise performance of a non-optimised
11 -- compiler is severely affected
14 FastString: A compact, hash-consed, representation of character strings.
15 Comparison is O(1), and you can get a Unique from them.
17 Turn into SDoc with Outputable.ftext
19 LitString: Just a wrapper for the Addr# of a C string (Ptr CChar).
20 Practically no operations
21 Outputing them is fast
23 Turn into SDoc with Outputable.ptext
25 Use LitString unless you want the facilities of FastString
30 FastString(..), -- not abstract, for now.
36 mkFastStringForeignPtr,
37 #if defined(__GLASGOW_HASKELL__)
44 unpackFS, -- :: FastString -> String
45 bytesFS, -- :: FastString -> [Word8]
71 #if defined(__GLASGOW_HASKELL__)
84 #include "HsVersions.h"
95 import System.IO.Unsafe ( unsafePerformIO )
96 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
97 import Data.Maybe ( isJust )
98 import Data.Char ( ord )
100 import GHC.IOBase ( IO(..) )
101 import GHC.Ptr ( Ptr(..) )
102 #if defined(__GLASGOW_HASKELL__)
103 import GHC.Base ( unpackCString# )
106 #define hASH_TBL_SIZE 4091
107 #define hASH_TBL_SIZE_UNBOXED 4091#
111 A 'FastString' is an array of bytes, hashed to support fast O(1)
112 comparison. It is also associated with a character encoding, so that
113 we know how to convert a 'FastString' to the local encoding, or to the
114 Z-encoding used by the compiler internally.
116 'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
119 data FastString = FastString {
120 uniq :: {-# UNPACK #-} !Int, -- unique id
121 n_bytes :: {-# UNPACK #-} !Int, -- number of bytes
122 n_chars :: {-# UNPACK #-} !Int, -- number of chars
123 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
128 -- including strings that don't need any encoding
130 -- A UTF-8 string with a memoized Z-encoding
131 | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
133 instance Eq FastString where
134 f1 == f2 = uniq f1 == uniq f2
136 instance Ord FastString where
137 -- Compares lexicographically, not by unique
138 a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False }
139 a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False }
140 a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True }
141 a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True }
146 compare a b = cmpFS a b
148 instance Show FastString where
149 show fs = show (unpackFS fs)
151 cmpFS :: FastString -> FastString -> Ordering
152 cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
153 if u1 == u2 then EQ else
154 case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
159 unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
160 unsafeMemcmp buf1 buf2 l =
162 withForeignPtr buf1 $ \p1 ->
163 withForeignPtr buf2 $ \p2 ->
167 foreign import ccall unsafe "ghc_memcmp"
168 memcmp :: Ptr a -> Ptr b -> Int -> IO Int
171 -- -----------------------------------------------------------------------------
175 Internally, the compiler will maintain a fast string symbol
176 table, providing sharing and fast comparison. Creation of
177 new @FastString@s then covertly does a lookup, re-using the
178 @FastString@ if there was a hit.
181 data FastStringTable =
184 (MutableArray# RealWorld [FastString])
186 {-# NOINLINE string_table #-}
187 string_table :: IORef FastStringTable
190 tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
192 (# s2#, FastStringTable 0 arr# #)
195 lookupTbl :: FastStringTable -> Int -> IO [FastString]
196 lookupTbl (FastStringTable _ arr#) (I# i#) =
197 IO $ \ s# -> readArray# arr# i# s#
199 updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
200 updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
201 (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
202 writeIORef fs_table_var (FastStringTable (uid+1) arr#)
204 mkFastString# :: Addr# -> FastString
205 mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
208 mkFastStringBytes :: Ptr Word8 -> Int -> FastString
209 mkFastStringBytes ptr len = unsafePerformIO $ do
210 ft@(FastStringTable uid _) <- readIORef string_table
214 fs <- copyNewFastString uid ptr len
215 updTbl string_table ft h (fs:ls)
216 {- _trace ("new: " ++ show f_str) $ -}
219 lookup_result <- lookupTbl ft h
220 case lookup_result of
223 b <- bucket_match ls len ptr
226 Just v -> {- _trace ("re-use: "++show v) $ -} return v
228 mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
229 mkZFastStringBytes ptr len = unsafePerformIO $ do
230 ft@(FastStringTable uid _) <- readIORef string_table
234 fs <- copyNewZFastString uid ptr len
235 updTbl string_table ft h (fs:ls)
236 {- _trace ("new: " ++ show f_str) $ -}
239 lookup_result <- lookupTbl ft h
240 case lookup_result of
243 b <- bucket_match ls len ptr
246 Just v -> {- _trace ("re-use: "++show v) $ -} return v
248 -- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
249 -- between this and 'mkFastStringBytes' is that we don't have to copy
250 -- the bytes if the string is new to the table.
251 mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
252 mkFastStringForeignPtr ptr fp len = do
253 ft@(FastStringTable uid _) <- readIORef string_table
254 -- _trace ("hashed: "++show (I# h)) $
258 fs <- mkNewFastString uid ptr fp len
259 updTbl string_table ft h (fs:ls)
260 {- _trace ("new: " ++ show f_str) $ -}
263 lookup_result <- lookupTbl ft h
264 case lookup_result of
267 b <- bucket_match ls len ptr
270 Just v -> {- _trace ("re-use: "++show v) $ -} return v
272 mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
273 mkZFastStringForeignPtr ptr fp len = do
274 ft@(FastStringTable uid _) <- readIORef string_table
275 -- _trace ("hashed: "++show (I# h)) $
279 fs <- mkNewZFastString uid ptr fp len
280 updTbl string_table ft h (fs:ls)
281 {- _trace ("new: " ++ show f_str) $ -}
284 lookup_result <- lookupTbl ft h
285 case lookup_result of
288 b <- bucket_match ls len ptr
291 Just v -> {- _trace ("re-use: "++show v) $ -} return v
294 -- | Creates a UTF-8 encoded 'FastString' from a 'String'
295 mkFastString :: String -> FastString
298 let l = utf8EncodedLength str
299 buf <- mallocForeignPtrBytes l
300 withForeignPtr buf $ \ptr -> do
301 utf8EncodeString ptr str
302 mkFastStringForeignPtr ptr buf l
304 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
305 mkFastStringByteList :: [Word8] -> FastString
306 mkFastStringByteList str =
308 let l = Prelude.length str
309 buf <- mallocForeignPtrBytes l
310 withForeignPtr buf $ \ptr -> do
311 pokeArray (castPtr ptr) str
312 mkFastStringForeignPtr ptr buf l
314 -- | Creates a Z-encoded 'FastString' from a 'String'
315 mkZFastString :: String -> FastString
318 let l = Prelude.length str
319 buf <- mallocForeignPtrBytes l
320 withForeignPtr buf $ \ptr -> do
321 pokeCAString (castPtr ptr) str
322 mkZFastStringForeignPtr ptr buf l
324 bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
325 bucket_match [] _ _ = return Nothing
326 bucket_match (v@(FastString _ l _ buf _):ls) len ptr
328 b <- cmpStringPrefix ptr buf len
329 if b then return (Just v)
330 else bucket_match ls len ptr
332 bucket_match ls len ptr
334 mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
336 mkNewFastString uid ptr fp len = do
337 ref <- newIORef Nothing
338 n_chars <- countUTF8Chars ptr len
339 return (FastString uid len n_chars fp (UTF8Encoded ref))
341 mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
343 mkNewZFastString uid _ fp len = do
344 return (FastString uid len len fp ZEncoded)
346 copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString
347 copyNewFastString uid ptr len = do
348 fp <- copyBytesToForeignPtr ptr len
349 ref <- newIORef Nothing
350 n_chars <- countUTF8Chars ptr len
351 return (FastString uid len n_chars fp (UTF8Encoded ref))
353 copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString
354 copyNewZFastString uid ptr len = do
355 fp <- copyBytesToForeignPtr ptr len
356 return (FastString uid len len fp ZEncoded)
358 copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
359 copyBytesToForeignPtr ptr len = do
360 fp <- mallocForeignPtrBytes len
361 withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
364 cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
365 cmpStringPrefix ptr fp len =
366 withForeignPtr fp $ \ptr' -> do
367 r <- memcmp ptr ptr' len
371 hashStr :: Ptr Word8 -> Int -> Int
372 -- use the Addr to produce a hash value between 0 & m (inclusive)
373 hashStr (Ptr a#) (I# len#) = loop 0# 0#
375 loop h n | n GHC.Exts.==# len# = I# h
376 | otherwise = loop h2 (n GHC.Exts.+# 1#)
377 where c = ord# (indexCharOffAddr# a# n)
378 h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#`
381 -- -----------------------------------------------------------------------------
384 -- | Returns the length of the 'FastString' in characters
385 lengthFS :: FastString -> Int
386 lengthFS f = n_chars f
388 -- | Returns 'True' if the 'FastString' is Z-encoded
389 isZEncoded :: FastString -> Bool
390 isZEncoded fs | ZEncoded <- enc fs = True
393 -- | Returns 'True' if this 'FastString' is not Z-encoded but already has
394 -- a Z-encoding cached (used in producing stats).
395 hasZEncoding :: FastString -> Bool
396 hasZEncoding (FastString _ _ _ _ enc) =
404 -- | Returns 'True' if the 'FastString' is empty
405 nullFS :: FastString -> Bool
406 nullFS f = n_bytes f == 0
408 -- | unpacks and decodes the FastString
409 unpackFS :: FastString -> String
410 unpackFS (FastString _ n_bytes _ buf enc) =
411 inlinePerformIO $ withForeignPtr buf $ \ptr ->
413 ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes)
414 UTF8Encoded _ -> utf8DecodeString ptr n_bytes
416 bytesFS :: FastString -> [Word8]
417 bytesFS (FastString _ n_bytes _ buf _) =
418 inlinePerformIO $ withForeignPtr buf $ \ptr ->
419 peekArray n_bytes ptr
421 -- | returns a Z-encoded version of a 'FastString'. This might be the
422 -- original, if it was already Z-encoded. The first time this
423 -- function is applied to a particular 'FastString', the results are
426 zEncodeFS :: FastString -> FastString
427 zEncodeFS fs@(FastString _ _ _ _ enc) =
436 let efs = mkZFastString (zEncodeString (unpackFS fs))
437 writeIORef ref (Just efs)
440 appendFS :: FastString -> FastString -> FastString
441 appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)
443 concatFS :: [FastString] -> FastString
444 concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better
446 headFS :: FastString -> Char
447 headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString"
448 headFS (FastString _ _ _ buf enc) =
449 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
452 w <- peek (castPtr ptr)
453 return (castCCharToChar w)
455 return (fst (utf8DecodeChar ptr))
457 tailFS :: FastString -> FastString
458 tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString"
459 tailFS (FastString _ n_bytes _ buf enc) =
460 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
463 return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
465 let (_,ptr') = utf8DecodeChar ptr
466 let off = ptr' `minusPtr` ptr
467 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)
469 consFS :: Char -> FastString -> FastString
470 consFS c fs = mkFastString (c : unpackFS fs)
472 uniqueOfFS :: FastString -> FastInt
473 uniqueOfFS (FastString u _ _ _ _) = iUnbox u
476 nilFS = mkFastString ""
478 -- -----------------------------------------------------------------------------
481 getFastStringTable :: IO [[FastString]]
482 getFastStringTable = do
483 tbl <- readIORef string_table
484 buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
487 -- -----------------------------------------------------------------------------
488 -- Outputting 'FastString's
490 -- |Outputs a 'FastString' with /no decoding at all/, that is, you
491 -- get the actual bytes in the 'FastString' written to the 'Handle'.
492 hPutFS :: Handle -> FastString -> IO ()
493 hPutFS handle (FastString _ len _ fp _)
494 | len == 0 = return ()
495 | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
497 -- ToDo: we'll probably want an hPutFSLocal, or something, to output
498 -- in the current locale's encoding (for error messages and suchlike).
500 -- -----------------------------------------------------------------------------
501 -- LitStrings, here for convenience only.
503 -- hmm, not unboxed (or rather FastPtr), interesting
504 --a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph. We don't
505 --really care about C types in naming, where we can help it.
506 type LitString = Ptr Word8
507 --Why do we recalculate length every time it's requested?
508 --If it's commonly needed, we should perhaps have
509 --data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt
511 #if defined(__GLASGOW_HASKELL__)
512 mkLitString# :: Addr# -> LitString
513 mkLitString# a# = Ptr a#
515 --can/should we use FastTypes here?
516 --Is this likely to be memory-preserving if only used on constant strings?
517 --should we inline it? If lucky, that would make a CAF that wouldn't
518 --be computationally repeated... although admittedly we're not
519 --really intending to use mkLitString when __GLASGOW_HASKELL__...
520 --(I wonder, is unicode / multi-byte characters allowed in LitStrings
522 {-# INLINE mkLitString #-}
523 mkLitString :: String -> LitString
526 p <- mallocBytes (length s + 1)
528 loop :: Int -> String -> IO ()
529 loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
531 pokeByteOff p n (fromIntegral (ord c) :: Word8)
533 -- XXX GHC isn't smart enough to know that we have already covered
535 loop _ [] = panic "mkLitString"
540 unpackLitString :: LitString -> String
541 unpackLitString p_ = case pUnbox p_ of
542 p -> unpack (_ILIT(0))
544 unpack n = case indexWord8OffFastPtrAsFastChar p n of
545 ch -> if ch `eqFastChar` _CLIT('\0')
546 then [] else cBox ch : unpack (n +# _ILIT(1))
548 strLength :: LitString -> Int
549 strLength = ptrStrLength
551 -- for now, use a simple String representation
552 --no, let's not do that right now - it's work in other places
554 type LitString = String
556 mkLitString :: String -> LitString
559 unpackLitString :: LitString -> String
562 strLength :: LitString -> Int
567 -- -----------------------------------------------------------------------------
570 foreign import ccall unsafe "ghc_strlen"
571 ptrStrLength :: Ptr Word8 -> Int
573 -- NB. does *not* add a '\0'-terminator.
574 -- We only use CChar here to be parallel to the imported
575 -- peekC(A)StringLen.
576 pokeCAString :: Ptr CChar -> String -> IO ()
577 pokeCAString ptr str =
580 go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
584 {-# NOINLINE sLit #-}
585 sLit :: String -> LitString
586 sLit x = mkLitString x
588 {-# NOINLINE fsLit #-}
589 fsLit :: String -> FastString
590 fsLit x = mkFastString x
593 forall x . sLit (unpackCString# x) = mkLitString# x #-}
595 forall x . fsLit (unpackCString# x) = mkFastString# x #-}