From: Simon Marlow
Date: Fri, 6 Mar 2009 10:00:18 +0000 (+0000)
Subject: Partial fix for #2917
X-Git-Tag: 2009-03-13~10
X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=1b62aecee4a58f52999cfa53f1c6b7744b29b808;p=ghc-hetmet.git

Partial fix for #2917

 - add newAlignedPinnedByteArray# for allocating pinned BAs with
   arbitrary alignment

 - the old newPinnedByteArray# now aligns to 16 bytes

Foreign.alloca will use newAlignedPinnedByteArray#, and so might end
up wasting less space than before (we used to align to 8 by default).

Foreign.allocaBytes and Foreign.mallocForeignPtrBytes will get
16-byte aligned memory, which is enough to avoid problems with SSE
instructions on x86, for example.

There was a bug in the old newPinnedByteArray#: it aligned to 8
bytes, but would have failed if the header was not a multiple of 8
(fortunately it always was, even with profiling).  Also we
occasionally wasted some space unnecessarily due to alignment in
allocatePinned().

I haven't done anything about Foreign.malloc/mallocBytes, which will
give you the same alignment guarantees as malloc() (8 bytes on
Linux/x86 here).
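As an aside (not part of this patch): a minimal sketch of how an
alloca-style wrapper could sit on top of the new primop.  The name
allocaBytesAligned' is made up for illustration, and the module names
(GHC.Exts, GHC.IO, GHC.Ptr) are those of later base versions; the real
Foreign.alloca plumbing may differ.

    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts (Int (I#), newAlignedPinnedByteArray#,
                     unsafeFreezeByteArray#, byteArrayContents#, touch#)
    import GHC.IO   (IO (IO))
    import GHC.Ptr  (Ptr (Ptr))

    -- Allocate size bytes of pinned memory whose payload is aligned to
    -- align bytes, run the action on a pointer to it, and keep the array
    -- alive (via touch#) until the action has finished.
    allocaBytesAligned' :: Int -> Int -> (Ptr a -> IO b) -> IO b
    allocaBytesAligned' (I# size) (I# align) action = IO $ \s0 ->
      case newAlignedPinnedByteArray# size align s0 of
        (# s1, mbarr# #) ->
          -- Freezing is safe here: the array is only used through the Ptr,
          -- and byteArrayContents# is safe because the array is pinned.
          case unsafeFreezeByteArray# mbarr# s1 of
            (# s2, barr# #) ->
              case action (Ptr (byteArrayContents# barr#)) of
                IO run -> case run s2 of
                  (# s3, r #) -> case touch# barr# s3 of
                    s4 -> (# s4, r #)

For example, allocaBytesAligned' 512 16 (\p -> ...) would hand the
action a 16-byte-aligned pointer into pinned memory, which is what the
message above says Foreign.allocaBytes now gets by default.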
---

diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 3779a79..942adb0 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -814,6 +814,11 @@ primop NewPinnedByteArrayOp_Char "newPinnedByteArray#" GenPrimOp
    {Create a mutable byte array that the GC guarantees not to move.}
    with out_of_line = True
 
+primop NewAlignedPinnedByteArrayOp_Char "newAlignedPinnedByteArray#" GenPrimOp
+   Int# -> Int# -> State# s -> (# State# s, MutableByteArray# s #)
+   {Create a mutable byte array, aligned by the specified amount, that the GC guarantees not to move.}
+   with out_of_line = True
+
 primop ByteArrayContents_Char "byteArrayContents#" GenPrimOp
    ByteArray# -> Addr#
    {Intended for use with pinned arrays; otherwise very unsafe!}
diff --git a/includes/StgMiscClosures.h b/includes/StgMiscClosures.h
index 94244a9..e2ec9e4 100644
--- a/includes/StgMiscClosures.h
+++ b/includes/StgMiscClosures.h
@@ -540,6 +540,7 @@ RTS_FUN(word64ToIntegerzh_fast);
 RTS_FUN(unsafeThawArrayzh_fast);
 RTS_FUN(newByteArrayzh_fast);
 RTS_FUN(newPinnedByteArrayzh_fast);
+RTS_FUN(newAlignedPinnedByteArrayzh_fast);
 RTS_FUN(newArrayzh_fast);
 RTS_FUN(newMutVarzh_fast);
diff --git a/rts/Linker.c b/rts/Linker.c
index dc0d344..f701377 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -692,6 +692,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(noDuplicatezh_fast)                  \
       SymI_HasProto(atomicModifyMutVarzh_fast)           \
       SymI_HasProto(newPinnedByteArrayzh_fast)           \
+      SymI_HasProto(newAlignedPinnedByteArrayzh_fast)    \
       SymI_HasProto(newSpark)                            \
       SymI_HasProto(orIntegerzh_fast)                    \
       SymI_HasProto(performGC)                           \
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index a6e221b..272c705 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -83,6 +83,9 @@ newByteArrayzh_fast
     RET_P(p);
 }
 
+#define BA_ALIGN 16
+#define BA_MASK  (BA_ALIGN-1)
+
 newPinnedByteArrayzh_fast
 {
     W_ words, payload_words, n, p;
@@ -91,24 +94,45 @@ newPinnedByteArrayzh_fast
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
 
-    // We want an 8-byte aligned array.  allocatePinned() gives us
+    words = payload_words + ((SIZEOF_StgArrWords + BA_MASK) & ~BA_MASK);
+
+    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+
+    // This bumps p forwards so that the payload falls on a BA_ALIGN-byte boundary.
+    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
+
+    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    StgArrWords_words(p) = payload_words;
+    RET_P(p);
+}
+
+newAlignedPinnedByteArrayzh_fast
+{
+    W_ words, payload_words, n, p, mask;
+
+    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
+    n = R1;
+
+    if (R2 > SIZEOF_W) {
+        mask = R2 - 1;
+    } else {
+        mask = 0;
+    }
+
+    payload_words = ROUNDUP_BYTES_TO_WDS(n);
+
+    // We want an R2-byte aligned array.  allocatePinned() gives us
     // 8-byte aligned memory by default, but we want to align the
     // *goods* inside the ArrWords object, so we have to check the
     // size of the ArrWords header and adjust our size accordingly.
-    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-        words = words + 1;
-    }
+    words = payload_words + ((SIZEOF_StgArrWords + mask) & ~mask);
 
     ("ptr" p) = foreign "C" allocatePinned(words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // Again, if the ArrWords header isn't a multiple of 8 bytes, we
-    // have to push the object forward one word so that the goods
-    // fall on an 8-byte boundary.
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-        p = p + WDS(1);
-    }
+    // This bumps p forwards so that the payload falls on an R2-byte boundary.
+    p = p + ((-p - SIZEOF_StgArrWords) & mask);
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c
index 6758cfa..5f56c1e 100644
--- a/rts/sm/Compact.c
+++ b/rts/sm/Compact.c
@@ -490,6 +490,10 @@ update_fwd_large( bdescr *bd )
 
     for (; bd != NULL; bd = bd->link) {
 
+        // nothing to do in a pinned block; it might not even have an object
+        // at the beginning.
+        if (bd->flags & BF_PINNED) continue;
+
         p = bd->start;
         info = get_itbl((StgClosure *)p);
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 1c453fc..8d37f27 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -248,10 +248,6 @@ evacuate_large(StgPtr p)
   stp = bd->step;
   ACQUIRE_SPIN_LOCK(&stp->sync_large_objects);
 
-  // object must be at the beginning of the block (or be a ByteArray)
-  ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS ||
-         (((W_)p & BLOCK_MASK) == 0));
-
   // already evacuated?
   if (bd->flags & BF_EVACUATED) {
     /* Don't forget to set the gct->failed_to_evac flag if we didn't get
@@ -287,11 +283,23 @@ evacuate_large(StgPtr p)
   }
 
   ws = &gct->steps[new_stp->abs_no];
+
   bd->flags |= BF_EVACUATED;
   bd->step = new_stp;
   bd->gen_no = new_stp->gen_no;
-  bd->link = ws->todo_large_objects;
-  ws->todo_large_objects = bd;
+
+  // If this is a block of pinned objects, we don't have to scan
+  // these objects, because they aren't allowed to contain any
+  // pointers.  For these blocks, we skip the scavenge stage and put
+  // them straight on the scavenged_large_objects list.
+  if (bd->flags & BF_PINNED) {
+      ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS);
+      dbl_link_onto(bd, &ws->step->scavenged_large_objects);
+      ws->step->n_scavenged_large_blocks += bd->blocks;
+  } else {
+      bd->link = ws->todo_large_objects;
+      ws->todo_large_objects = bd;
+  }
 
   RELEASE_SPIN_LOCK(&stp->sync_large_objects);
 }
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 36741a6..9dea30e 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -808,7 +808,9 @@ allocatePinned( lnat n )
     // If the request is for a large object, then allocate()
     // will give us a pinned object anyway.
     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
-        return allocate(n);
+        p = allocate(n);
+        Bdescr(p)->flags |= BF_PINNED;
+        return p;
     }
 
     ACQUIRE_SM_LOCK;
@@ -816,13 +818,6 @@
     TICK_ALLOC_HEAP_NOCTR(n);
     CCS_ALLOC(CCCS,n);
 
-    // we always return 8-byte aligned memory.  bd->free must be
-    // 8-byte aligned to begin with, so we just round up n to
-    // the nearest multiple of 8 bytes.
-    if (sizeof(StgWord) == 4) {
-        n = (n+1) & ~1;
-    }
-
     // If we don't have a block of pinned objects yet, or the current
     // one isn't large enough to hold the new object, allocate a new one.
     if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
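As a further aside (again not part of the patch): the two bit tricks in
the PrimOps.cmm hunks above can be checked with a few lines of ordinary
Haskell.  The header size and address below are made-up example values;
mask is alignment-1, as in the C-- code.

    import Data.Bits (complement, (.&.))

    -- Mirrors: words = payload_words + ((SIZEOF_StgArrWords + mask) & ~mask)
    -- i.e. reserve room for the header rounded up to the alignment.
    roundedHeader :: Int -> Int -> Int
    roundedHeader hdr mask = (hdr + mask) .&. complement mask

    -- Mirrors: p = p + ((-p - SIZEOF_StgArrWords) & mask)
    -- i.e. bump the object forward just enough that (p + hdr) is aligned.
    bumpForAlignment :: Int -> Int -> Int -> Int
    bumpForAlignment p hdr mask = p + ((negate p - hdr) .&. mask)

    main :: IO ()
    main = do
      let hdr  = 8        -- pretend SIZEOF_StgArrWords is 8 bytes
          mask = 16 - 1   -- BA_MASK, i.e. 16-byte alignment
          p    = 4096     -- an example address returned by allocatePinned()
      print (roundedHeader hdr mask)             -- 16
      print (bumpForAlignment p hdr mask)        -- 4104: the bumped object start
      print (bumpForAlignment p hdr mask + hdr)  -- 4112: the payload, a multiple of 16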