2 % (c) The University of Glasgow, 1997-2006
5 {-# OPTIONS -fno-warn-unused-imports #-}
6 -- XXX GHC 6.9 seems to be confused by unpackCString# being used only in
9 {-# OPTIONS_GHC -O -funbox-strict-fields #-}
10 -- We always optimise this, otherwise performance of a non-optimised
11 -- compiler is severely affected
14 -- There are two principal string types used internally by GHC:
17 -- * A compact, hash-consed, representation of character strings.
18 -- * Comparison is O(1), and you can get a 'Unique.Unique' from them.
19 -- * Generated by 'fsLit'.
20 -- * Turn into 'Outputable.SDoc' with 'Outputable.ftext'.
23 -- * Just a wrapper for the @Addr#@ of a C string (@Ptr CChar@).
24 -- * Practically no operations.
25 -- * Outputing them is fast.
26 -- * Generated by 'sLit'.
27 -- * Turn into 'Outputable.SDoc' with 'Outputable.ptext'
29 -- Use 'LitString' unless you want the facilities of 'FastString'.
33 FastString(..), -- not abstract, for now.
40 mkFastStringForeignPtr,
41 #if defined(__GLASGOW_HASKELL__)
48 unpackFS, -- :: FastString -> String
49 bytesFS, -- :: FastString -> [Word8]
78 #if defined(__GLASGOW_HASKELL__)
90 #include "HsVersions.h"
101 import System.IO.Unsafe ( unsafePerformIO )
102 import Data.IORef ( IORef, newIORef, readIORef, atomicModifyIORef )
103 import Data.Maybe ( isJust )
104 import Data.Char ( ord )
106 #if __GLASGOW_HASKELL__ >= 611
107 import GHC.IO ( IO(..) )
109 import GHC.IOBase ( IO(..) )
112 import GHC.Ptr ( Ptr(..) )
113 #if defined(__GLASGOW_HASKELL__)
114 import GHC.Base ( unpackCString# )
117 #define hASH_TBL_SIZE 4091
118 #define hASH_TBL_SIZE_UNBOXED 4091#
122 A 'FastString' is an array of bytes, hashed to support fast O(1)
123 comparison. It is also associated with a character encoding, so that
124 we know how to convert a 'FastString' to the local encoding, or to the
125 Z-encoding used by the compiler internally.
127 'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
130 data FastString = FastString {
131 uniq :: {-# UNPACK #-} !Int, -- unique id
132 n_bytes :: {-# UNPACK #-} !Int, -- number of bytes
133 n_chars :: {-# UNPACK #-} !Int, -- number of chars
134 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
139 -- including strings that don't need any encoding
141 -- A UTF-8 string with a memoized Z-encoding
142 | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
144 instance Eq FastString where
145 f1 == f2 = uniq f1 == uniq f2
147 instance Ord FastString where
148 -- Compares lexicographically, not by unique
149 a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False }
150 a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False }
151 a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True }
152 a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True }
157 compare a b = cmpFS a b
159 instance Show FastString where
160 show fs = show (unpackFS fs)
162 cmpFS :: FastString -> FastString -> Ordering
163 cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
164 if u1 == u2 then EQ else
165 case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
170 unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
171 unsafeMemcmp buf1 buf2 l =
173 withForeignPtr buf1 $ \p1 ->
174 withForeignPtr buf2 $ \p2 ->
178 foreign import ccall unsafe "ghc_memcmp"
179 memcmp :: Ptr a -> Ptr b -> Int -> IO Int
182 -- -----------------------------------------------------------------------------
186 Internally, the compiler will maintain a fast string symbol
187 table, providing sharing and fast comparison. Creation of
188 new @FastString@s then covertly does a lookup, re-using the
189 @FastString@ if there was a hit.
192 data FastStringTable =
195 (MutableArray# RealWorld [FastString])
197 {-# NOINLINE string_table #-}
198 string_table :: IORef FastStringTable
201 tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
203 (# s2#, FastStringTable 0 arr# #)
206 lookupTbl :: FastStringTable -> Int -> IO [FastString]
207 lookupTbl (FastStringTable _ arr#) (I# i#) =
208 IO $ \ s# -> readArray# arr# i# s#
210 updTbl :: FastStringTable -> Int -> [FastString] -> IO FastStringTable
211 updTbl (FastStringTable uid arr#) (I# i#) ls = do
212 (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
213 return (FastStringTable (uid+1) arr#)
215 -- | Helper function for various forms of fast string constructors.
216 mkFSInternal :: Ptr Word8 -> Int
217 -> (Int -> IO FastString)
219 -- The interesting part is the use of unsafePerformIO to make the
220 -- argument to atomicModifyIORef pure. This is safe because any
221 -- effect dependencies are enforced by data dependencies.
222 -- Furthermore, every result is used and hence there should be no
224 mkFSInternal ptr len mk_it = do
225 r <- atomicModifyIORef string_table $
226 \fs_tbl@(FastStringTable uid _) ->
227 let h = hashStr ptr len
230 fst' <- updTbl fs_tbl h (fs:ls)
231 fs `seq` fst' `seq` return (fst', fs)
232 in unsafePerformIO $ do
233 lookup_result <- lookupTbl fs_tbl h
234 case lookup_result of
237 b <- bucket_match ls len ptr
240 Just v -> return (fs_tbl, v)
243 mkFastString# :: Addr# -> FastString
244 mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
247 mkFastStringBytes :: Ptr Word8 -> Int -> FastString
248 mkFastStringBytes ptr len = inlinePerformIO $ do
249 mkFSInternal ptr len (\uid -> copyNewFastString uid ptr len)
251 mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
252 mkZFastStringBytes ptr len = inlinePerformIO $ do
253 mkFSInternal ptr len (\uid -> copyNewZFastString uid ptr len)
255 -- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
256 -- between this and 'mkFastStringBytes' is that we don't have to copy
257 -- the bytes if the string is new to the table.
258 mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
259 mkFastStringForeignPtr ptr fp len = do
260 mkFSInternal ptr len (\uid -> mkNewFastString uid ptr fp len)
262 mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
263 mkZFastStringForeignPtr ptr fp len = do
264 mkFSInternal ptr len (\uid -> mkNewZFastString uid ptr fp len)
266 -- | Creates a UTF-8 encoded 'FastString' from a 'String'
267 mkFastString :: String -> FastString
270 let l = utf8EncodedLength str
271 buf <- mallocForeignPtrBytes l
272 withForeignPtr buf $ \ptr -> do
273 utf8EncodeString ptr str
274 mkFastStringForeignPtr ptr buf l
276 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
277 mkFastStringByteList :: [Word8] -> FastString
278 mkFastStringByteList str =
280 let l = Prelude.length str
281 buf <- mallocForeignPtrBytes l
282 withForeignPtr buf $ \ptr -> do
283 pokeArray (castPtr ptr) str
284 mkFastStringForeignPtr ptr buf l
286 -- | Creates a Z-encoded 'FastString' from a 'String'
287 mkZFastString :: String -> FastString
290 let l = Prelude.length str
291 buf <- mallocForeignPtrBytes l
292 withForeignPtr buf $ \ptr -> do
293 pokeCAString (castPtr ptr) str
294 mkZFastStringForeignPtr ptr buf l
296 bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
297 bucket_match [] _ _ = return Nothing
298 bucket_match (v@(FastString _ l _ buf _):ls) len ptr
300 b <- cmpStringPrefix ptr buf len
301 if b then return (Just v)
302 else bucket_match ls len ptr
304 bucket_match ls len ptr
306 mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
308 mkNewFastString uid ptr fp len = do
309 ref <- newIORef Nothing
310 n_chars <- countUTF8Chars ptr len
311 return (FastString uid len n_chars fp (UTF8Encoded ref))
313 mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
315 mkNewZFastString uid _ fp len = do
316 return (FastString uid len len fp ZEncoded)
318 copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString
319 copyNewFastString uid ptr len = do
320 fp <- copyBytesToForeignPtr ptr len
321 ref <- newIORef Nothing
322 n_chars <- countUTF8Chars ptr len
323 return (FastString uid len n_chars fp (UTF8Encoded ref))
325 copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString
326 copyNewZFastString uid ptr len = do
327 fp <- copyBytesToForeignPtr ptr len
328 return (FastString uid len len fp ZEncoded)
330 copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
331 copyBytesToForeignPtr ptr len = do
332 fp <- mallocForeignPtrBytes len
333 withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
336 cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
337 cmpStringPrefix ptr fp len =
338 withForeignPtr fp $ \ptr' -> do
339 r <- memcmp ptr ptr' len
343 hashStr :: Ptr Word8 -> Int -> Int
344 -- use the Addr to produce a hash value between 0 & m (inclusive)
345 hashStr (Ptr a#) (I# len#) = loop 0# 0#
347 loop h n | n GHC.Exts.==# len# = I# h
348 | otherwise = loop h2 (n GHC.Exts.+# 1#)
349 where !c = ord# (indexCharOffAddr# a# n)
350 !h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#`
353 -- -----------------------------------------------------------------------------
356 -- | Returns the length of the 'FastString' in characters
357 lengthFS :: FastString -> Int
358 lengthFS f = n_chars f
360 -- | Returns @True@ if the 'FastString' is Z-encoded
361 isZEncoded :: FastString -> Bool
362 isZEncoded fs | ZEncoded <- enc fs = True
365 -- | Returns @True@ if this 'FastString' is not Z-encoded but already has
366 -- a Z-encoding cached (used in producing stats).
367 hasZEncoding :: FastString -> Bool
368 hasZEncoding (FastString _ _ _ _ enc) =
376 -- | Returns @True@ if the 'FastString' is empty
377 nullFS :: FastString -> Bool
378 nullFS f = n_bytes f == 0
380 -- | Unpacks and decodes the FastString
381 unpackFS :: FastString -> String
382 unpackFS (FastString _ n_bytes _ buf enc) =
383 inlinePerformIO $ withForeignPtr buf $ \ptr ->
385 ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes)
386 UTF8Encoded _ -> utf8DecodeString ptr n_bytes
388 bytesFS :: FastString -> [Word8]
389 bytesFS (FastString _ n_bytes _ buf _) =
390 inlinePerformIO $ withForeignPtr buf $ \ptr ->
391 peekArray n_bytes ptr
393 -- | Returns a Z-encoded version of a 'FastString'. This might be the
394 -- original, if it was already Z-encoded. The first time this
395 -- function is applied to a particular 'FastString', the results are
398 zEncodeFS :: FastString -> FastString
399 zEncodeFS fs@(FastString _ _ _ _ enc) =
404 r <- atomicModifyIORef ref $ \m ->
408 let efs = mkZFastString (zEncodeString (unpackFS fs)) in
409 efs `seq` (Just efs, efs)
412 appendFS :: FastString -> FastString -> FastString
413 appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)
415 concatFS :: [FastString] -> FastString
416 concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better
418 headFS :: FastString -> Char
419 headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString"
420 headFS (FastString _ _ _ buf enc) =
421 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
424 w <- peek (castPtr ptr)
425 return (castCCharToChar w)
427 return (fst (utf8DecodeChar ptr))
429 tailFS :: FastString -> FastString
430 tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString"
431 tailFS (FastString _ n_bytes _ buf enc) =
432 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
435 return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
437 let (_,ptr') = utf8DecodeChar ptr
438 let off = ptr' `minusPtr` ptr
439 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)
441 consFS :: Char -> FastString -> FastString
442 consFS c fs = mkFastString (c : unpackFS fs)
444 uniqueOfFS :: FastString -> FastInt
445 uniqueOfFS (FastString u _ _ _ _) = iUnbox u
448 nilFS = mkFastString ""
450 -- -----------------------------------------------------------------------------
453 getFastStringTable :: IO [[FastString]]
454 getFastStringTable = do
455 tbl <- readIORef string_table
456 buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
459 -- -----------------------------------------------------------------------------
460 -- Outputting 'FastString's
462 -- |Outputs a 'FastString' with /no decoding at all/, that is, you
463 -- get the actual bytes in the 'FastString' written to the 'Handle'.
464 hPutFS :: Handle -> FastString -> IO ()
465 hPutFS handle (FastString _ len _ fp _)
466 | len == 0 = return ()
467 | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
469 -- ToDo: we'll probably want an hPutFSLocal, or something, to output
470 -- in the current locale's encoding (for error messages and suchlike).
472 -- -----------------------------------------------------------------------------
473 -- LitStrings, here for convenience only.
475 -- hmm, not unboxed (or rather FastPtr), interesting
476 --a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph. We don't
477 --really care about C types in naming, where we can help it.
478 type LitString = Ptr Word8
479 --Why do we recalculate length every time it's requested?
480 --If it's commonly needed, we should perhaps have
481 --data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt
483 #if defined(__GLASGOW_HASKELL__)
484 mkLitString# :: Addr# -> LitString
485 mkLitString# a# = Ptr a#
487 --can/should we use FastTypes here?
488 --Is this likely to be memory-preserving if only used on constant strings?
489 --should we inline it? If lucky, that would make a CAF that wouldn't
490 --be computationally repeated... although admittedly we're not
491 --really intending to use mkLitString when __GLASGOW_HASKELL__...
492 --(I wonder, is unicode / multi-byte characters allowed in LitStrings
494 {-# INLINE mkLitString #-}
495 mkLitString :: String -> LitString
498 p <- mallocBytes (length s + 1)
500 loop :: Int -> String -> IO ()
501 loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
503 pokeByteOff p n (fromIntegral (ord c) :: Word8)
505 -- XXX GHC isn't smart enough to know that we have already covered
507 loop _ [] = panic "mkLitString"
512 unpackLitString :: LitString -> String
513 unpackLitString p_ = case pUnbox p_ of
514 p -> unpack (_ILIT(0))
516 unpack n = case indexWord8OffFastPtrAsFastChar p n of
517 ch -> if ch `eqFastChar` _CLIT('\0')
518 then [] else cBox ch : unpack (n +# _ILIT(1))
520 lengthLS :: LitString -> Int
521 lengthLS = ptrStrLength
523 -- for now, use a simple String representation
524 --no, let's not do that right now - it's work in other places
526 type LitString = String
528 mkLitString :: String -> LitString
531 unpackLitString :: LitString -> String
534 lengthLS :: LitString -> Int
539 -- -----------------------------------------------------------------------------
542 foreign import ccall unsafe "ghc_strlen"
543 ptrStrLength :: Ptr Word8 -> Int
545 -- NB. does *not* add a '\0'-terminator.
546 -- We only use CChar here to be parallel to the imported
547 -- peekC(A)StringLen.
548 pokeCAString :: Ptr CChar -> String -> IO ()
549 pokeCAString ptr str =
552 go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
556 {-# NOINLINE sLit #-}
557 sLit :: String -> LitString
558 sLit x = mkLitString x
560 {-# NOINLINE fsLit #-}
561 fsLit :: String -> FastString
562 fsLit x = mkFastString x
565 forall x . sLit (unpackCString# x) = mkLitString# x #-}
567 forall x . fsLit (unpackCString# x) = mkFastString# x #-}