2 % (c) The University of Glasgow, 1997-2006
5 {-# OPTIONS -fno-warn-unused-imports #-}
6 -- XXX GHC 6.9 seems to be confused by unpackCString# being used only in
9 {-# OPTIONS_GHC -O -funbox-strict-fields #-}
10 -- We always optimise this, otherwise performance of a non-optimised
11 -- compiler is severely affected
14 -- There are two principal string types used internally by GHC:
17 -- * A compact, hash-consed, representation of character strings.
18 -- * Comparison is O(1), and you can get a 'Unique.Unique' from them.
19 -- * Generated by 'fsLit'.
20 -- * Turn into 'Outputable.SDoc' with 'Outputable.ftext'.
23 -- * Just a wrapper for the @Addr#@ of a C string (@Ptr CChar@).
24 -- * Practically no operations.
25 -- * Outputing them is fast.
26 -- * Generated by 'sLit'.
27 -- * Turn into 'Outputable.SDoc' with 'Outputable.ptext'
29 -- Use 'LitString' unless you want the facilities of 'FastString'.
33 FastString(..), -- not abstract, for now.
40 mkFastStringForeignPtr,
41 #if defined(__GLASGOW_HASKELL__)
48 unpackFS, -- :: FastString -> String
49 bytesFS, -- :: FastString -> [Word8]
78 #if defined(__GLASGOW_HASKELL__)
90 #include "HsVersions.h"
101 import System.IO.Unsafe ( unsafePerformIO )
102 import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
103 import Data.Maybe ( isJust )
104 import Data.Char ( ord )
106 import GHC.IOBase ( IO(..) )
107 import GHC.Ptr ( Ptr(..) )
108 #if defined(__GLASGOW_HASKELL__)
109 import GHC.Base ( unpackCString# )
112 #define hASH_TBL_SIZE 4091
113 #define hASH_TBL_SIZE_UNBOXED 4091#
117 A 'FastString' is an array of bytes, hashed to support fast O(1)
118 comparison. It is also associated with a character encoding, so that
119 we know how to convert a 'FastString' to the local encoding, or to the
120 Z-encoding used by the compiler internally.
122 'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
125 data FastString = FastString {
126 uniq :: {-# UNPACK #-} !Int, -- unique id
127 n_bytes :: {-# UNPACK #-} !Int, -- number of bytes
128 n_chars :: {-# UNPACK #-} !Int, -- number of chars
129 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
134 -- including strings that don't need any encoding
136 -- A UTF-8 string with a memoized Z-encoding
137 | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
139 instance Eq FastString where
140 f1 == f2 = uniq f1 == uniq f2
142 instance Ord FastString where
143 -- Compares lexicographically, not by unique
144 a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False }
145 a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False }
146 a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True }
147 a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True }
152 compare a b = cmpFS a b
154 instance Show FastString where
155 show fs = show (unpackFS fs)
157 cmpFS :: FastString -> FastString -> Ordering
158 cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
159 if u1 == u2 then EQ else
160 case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
165 unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
166 unsafeMemcmp buf1 buf2 l =
168 withForeignPtr buf1 $ \p1 ->
169 withForeignPtr buf2 $ \p2 ->
173 foreign import ccall unsafe "ghc_memcmp"
174 memcmp :: Ptr a -> Ptr b -> Int -> IO Int
177 -- -----------------------------------------------------------------------------
181 Internally, the compiler will maintain a fast string symbol
182 table, providing sharing and fast comparison. Creation of
183 new @FastString@s then covertly does a lookup, re-using the
184 @FastString@ if there was a hit.
187 data FastStringTable =
190 (MutableArray# RealWorld [FastString])
192 {-# NOINLINE string_table #-}
193 string_table :: IORef FastStringTable
196 tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
198 (# s2#, FastStringTable 0 arr# #)
201 lookupTbl :: FastStringTable -> Int -> IO [FastString]
202 lookupTbl (FastStringTable _ arr#) (I# i#) =
203 IO $ \ s# -> readArray# arr# i# s#
205 updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
206 updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
207 (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
208 writeIORef fs_table_var (FastStringTable (uid+1) arr#)
210 mkFastString# :: Addr# -> FastString
211 mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
214 mkFastStringBytes :: Ptr Word8 -> Int -> FastString
215 mkFastStringBytes ptr len = unsafePerformIO $ do
216 ft@(FastStringTable uid _) <- readIORef string_table
220 fs <- copyNewFastString uid ptr len
221 updTbl string_table ft h (fs:ls)
222 {- _trace ("new: " ++ show f_str) $ -}
225 lookup_result <- lookupTbl ft h
226 case lookup_result of
229 b <- bucket_match ls len ptr
232 Just v -> {- _trace ("re-use: "++show v) $ -} return v
234 mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
235 mkZFastStringBytes ptr len = unsafePerformIO $ do
236 ft@(FastStringTable uid _) <- readIORef string_table
240 fs <- copyNewZFastString uid ptr len
241 updTbl string_table ft h (fs:ls)
242 {- _trace ("new: " ++ show f_str) $ -}
245 lookup_result <- lookupTbl ft h
246 case lookup_result of
249 b <- bucket_match ls len ptr
252 Just v -> {- _trace ("re-use: "++show v) $ -} return v
254 -- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
255 -- between this and 'mkFastStringBytes' is that we don't have to copy
256 -- the bytes if the string is new to the table.
257 mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
258 mkFastStringForeignPtr ptr fp len = do
259 ft@(FastStringTable uid _) <- readIORef string_table
260 -- _trace ("hashed: "++show (I# h)) $
264 fs <- mkNewFastString uid ptr fp len
265 updTbl string_table ft h (fs:ls)
266 {- _trace ("new: " ++ show f_str) $ -}
269 lookup_result <- lookupTbl ft h
270 case lookup_result of
273 b <- bucket_match ls len ptr
276 Just v -> {- _trace ("re-use: "++show v) $ -} return v
278 mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
279 mkZFastStringForeignPtr ptr fp len = do
280 ft@(FastStringTable uid _) <- readIORef string_table
281 -- _trace ("hashed: "++show (I# h)) $
285 fs <- mkNewZFastString uid ptr fp len
286 updTbl string_table ft h (fs:ls)
287 {- _trace ("new: " ++ show f_str) $ -}
290 lookup_result <- lookupTbl ft h
291 case lookup_result of
294 b <- bucket_match ls len ptr
297 Just v -> {- _trace ("re-use: "++show v) $ -} return v
300 -- | Creates a UTF-8 encoded 'FastString' from a 'String'
301 mkFastString :: String -> FastString
304 let l = utf8EncodedLength str
305 buf <- mallocForeignPtrBytes l
306 withForeignPtr buf $ \ptr -> do
307 utf8EncodeString ptr str
308 mkFastStringForeignPtr ptr buf l
310 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
311 mkFastStringByteList :: [Word8] -> FastString
312 mkFastStringByteList str =
314 let l = Prelude.length str
315 buf <- mallocForeignPtrBytes l
316 withForeignPtr buf $ \ptr -> do
317 pokeArray (castPtr ptr) str
318 mkFastStringForeignPtr ptr buf l
320 -- | Creates a Z-encoded 'FastString' from a 'String'
321 mkZFastString :: String -> FastString
324 let l = Prelude.length str
325 buf <- mallocForeignPtrBytes l
326 withForeignPtr buf $ \ptr -> do
327 pokeCAString (castPtr ptr) str
328 mkZFastStringForeignPtr ptr buf l
330 bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
331 bucket_match [] _ _ = return Nothing
332 bucket_match (v@(FastString _ l _ buf _):ls) len ptr
334 b <- cmpStringPrefix ptr buf len
335 if b then return (Just v)
336 else bucket_match ls len ptr
338 bucket_match ls len ptr
340 mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
342 mkNewFastString uid ptr fp len = do
343 ref <- newIORef Nothing
344 n_chars <- countUTF8Chars ptr len
345 return (FastString uid len n_chars fp (UTF8Encoded ref))
347 mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
349 mkNewZFastString uid _ fp len = do
350 return (FastString uid len len fp ZEncoded)
352 copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString
353 copyNewFastString uid ptr len = do
354 fp <- copyBytesToForeignPtr ptr len
355 ref <- newIORef Nothing
356 n_chars <- countUTF8Chars ptr len
357 return (FastString uid len n_chars fp (UTF8Encoded ref))
359 copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString
360 copyNewZFastString uid ptr len = do
361 fp <- copyBytesToForeignPtr ptr len
362 return (FastString uid len len fp ZEncoded)
364 copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
365 copyBytesToForeignPtr ptr len = do
366 fp <- mallocForeignPtrBytes len
367 withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
370 cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
371 cmpStringPrefix ptr fp len =
372 withForeignPtr fp $ \ptr' -> do
373 r <- memcmp ptr ptr' len
377 hashStr :: Ptr Word8 -> Int -> Int
378 -- use the Addr to produce a hash value between 0 & m (inclusive)
379 hashStr (Ptr a#) (I# len#) = loop 0# 0#
381 loop h n | n GHC.Exts.==# len# = I# h
382 | otherwise = loop h2 (n GHC.Exts.+# 1#)
383 where !c = ord# (indexCharOffAddr# a# n)
384 !h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#`
387 -- -----------------------------------------------------------------------------
390 -- | Returns the length of the 'FastString' in characters
391 lengthFS :: FastString -> Int
392 lengthFS f = n_chars f
394 -- | Returns @True@ if the 'FastString' is Z-encoded
395 isZEncoded :: FastString -> Bool
396 isZEncoded fs | ZEncoded <- enc fs = True
399 -- | Returns @True@ if this 'FastString' is not Z-encoded but already has
400 -- a Z-encoding cached (used in producing stats).
401 hasZEncoding :: FastString -> Bool
402 hasZEncoding (FastString _ _ _ _ enc) =
410 -- | Returns @True@ if the 'FastString' is empty
411 nullFS :: FastString -> Bool
412 nullFS f = n_bytes f == 0
414 -- | Unpacks and decodes the FastString
415 unpackFS :: FastString -> String
416 unpackFS (FastString _ n_bytes _ buf enc) =
417 inlinePerformIO $ withForeignPtr buf $ \ptr ->
419 ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes)
420 UTF8Encoded _ -> utf8DecodeString ptr n_bytes
422 bytesFS :: FastString -> [Word8]
423 bytesFS (FastString _ n_bytes _ buf _) =
424 inlinePerformIO $ withForeignPtr buf $ \ptr ->
425 peekArray n_bytes ptr
427 -- | Returns a Z-encoded version of a 'FastString'. This might be the
428 -- original, if it was already Z-encoded. The first time this
429 -- function is applied to a particular 'FastString', the results are
432 zEncodeFS :: FastString -> FastString
433 zEncodeFS fs@(FastString _ _ _ _ enc) =
442 let efs = mkZFastString (zEncodeString (unpackFS fs))
443 writeIORef ref (Just efs)
446 appendFS :: FastString -> FastString -> FastString
447 appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)
449 concatFS :: [FastString] -> FastString
450 concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better
452 headFS :: FastString -> Char
453 headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString"
454 headFS (FastString _ _ _ buf enc) =
455 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
458 w <- peek (castPtr ptr)
459 return (castCCharToChar w)
461 return (fst (utf8DecodeChar ptr))
463 tailFS :: FastString -> FastString
464 tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString"
465 tailFS (FastString _ n_bytes _ buf enc) =
466 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
469 return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
471 let (_,ptr') = utf8DecodeChar ptr
472 let off = ptr' `minusPtr` ptr
473 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)
475 consFS :: Char -> FastString -> FastString
476 consFS c fs = mkFastString (c : unpackFS fs)
478 uniqueOfFS :: FastString -> FastInt
479 uniqueOfFS (FastString u _ _ _ _) = iUnbox u
482 nilFS = mkFastString ""
484 -- -----------------------------------------------------------------------------
487 getFastStringTable :: IO [[FastString]]
488 getFastStringTable = do
489 tbl <- readIORef string_table
490 buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
493 -- -----------------------------------------------------------------------------
494 -- Outputting 'FastString's
496 -- |Outputs a 'FastString' with /no decoding at all/, that is, you
497 -- get the actual bytes in the 'FastString' written to the 'Handle'.
498 hPutFS :: Handle -> FastString -> IO ()
499 hPutFS handle (FastString _ len _ fp _)
500 | len == 0 = return ()
501 | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
503 -- ToDo: we'll probably want an hPutFSLocal, or something, to output
504 -- in the current locale's encoding (for error messages and suchlike).
506 -- -----------------------------------------------------------------------------
507 -- LitStrings, here for convenience only.
509 -- hmm, not unboxed (or rather FastPtr), interesting
510 --a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph. We don't
511 --really care about C types in naming, where we can help it.
512 type LitString = Ptr Word8
513 --Why do we recalculate length every time it's requested?
514 --If it's commonly needed, we should perhaps have
515 --data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt
517 #if defined(__GLASGOW_HASKELL__)
518 mkLitString# :: Addr# -> LitString
519 mkLitString# a# = Ptr a#
521 --can/should we use FastTypes here?
522 --Is this likely to be memory-preserving if only used on constant strings?
523 --should we inline it? If lucky, that would make a CAF that wouldn't
524 --be computationally repeated... although admittedly we're not
525 --really intending to use mkLitString when __GLASGOW_HASKELL__...
526 --(I wonder, is unicode / multi-byte characters allowed in LitStrings
528 {-# INLINE mkLitString #-}
529 mkLitString :: String -> LitString
532 p <- mallocBytes (length s + 1)
534 loop :: Int -> String -> IO ()
535 loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
537 pokeByteOff p n (fromIntegral (ord c) :: Word8)
539 -- XXX GHC isn't smart enough to know that we have already covered
541 loop _ [] = panic "mkLitString"
546 unpackLitString :: LitString -> String
547 unpackLitString p_ = case pUnbox p_ of
548 p -> unpack (_ILIT(0))
550 unpack n = case indexWord8OffFastPtrAsFastChar p n of
551 ch -> if ch `eqFastChar` _CLIT('\0')
552 then [] else cBox ch : unpack (n +# _ILIT(1))
554 lengthLS :: LitString -> Int
555 lengthLS = ptrStrLength
557 -- for now, use a simple String representation
558 --no, let's not do that right now - it's work in other places
560 type LitString = String
562 mkLitString :: String -> LitString
565 unpackLitString :: LitString -> String
568 lengthLS :: LitString -> Int
573 -- -----------------------------------------------------------------------------
576 foreign import ccall unsafe "ghc_strlen"
577 ptrStrLength :: Ptr Word8 -> Int
579 -- NB. does *not* add a '\0'-terminator.
580 -- We only use CChar here to be parallel to the imported
581 -- peekC(A)StringLen.
582 pokeCAString :: Ptr CChar -> String -> IO ()
583 pokeCAString ptr str =
586 go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
590 {-# NOINLINE sLit #-}
591 sLit :: String -> LitString
592 sLit x = mkLitString x
594 {-# NOINLINE fsLit #-}
595 fsLit :: String -> FastString
596 fsLit x = mkFastString x
599 forall x . sLit (unpackCString# x) = mkLitString# x #-}
601 forall x . fsLit (unpackCString# x) = mkFastString# x #-}