From 04db0e9fa47ce4dfbcb73ec1752d94195f3b394e Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Tue, 28 Feb 2006 15:25:24 +0000 Subject: [PATCH] pass arguments to unknown function calls in registers We now have more stg_ap entry points: stg_ap_*_fast, which take arguments in registers according to the platform calling convention. This is faster if the function being called is evaluated and has the right arity, which is the common case (see the eval/apply paper for measurements). We still need the stg_ap_*_info entry points for stack-based application, such as an overflows when a function is applied to too many argumnets. The stg_ap_*_fast functions actually just check for an evaluated function, and if they don't find one, push the args on the stack and invoke stg_ap_*_info. (this might be slightly slower in some cases, but not the common case). --- ghc/compiler/cmm/CLabel.hs | 9 ++ ghc/compiler/codeGen/CgCallConv.hs | 15 +-- ghc/compiler/codeGen/CgTailCall.lhs | 75 +++++++------- ghc/includes/StgMiscClosures.h | 18 +++- ghc/rts/Apply.cmm | 24 +---- ghc/rts/Exception.cmm | 12 +-- ghc/rts/Linker.c | 17 ++- ghc/rts/PrimOps.cmm | 27 ++--- ghc/rts/StgStdThunks.cmm | 15 ++- ghc/utils/genapply/GenApply.hs | 193 ++++++++++++++++++++++++++++++----- 10 files changed, 275 insertions(+), 130 deletions(-) diff --git a/ghc/compiler/cmm/CLabel.hs b/ghc/compiler/cmm/CLabel.hs index 2f52e42..e42b92d 100644 --- a/ghc/compiler/cmm/CLabel.hs +++ b/ghc/compiler/cmm/CLabel.hs @@ -80,6 +80,8 @@ module CLabel ( mkRtsCodeLabelFS, mkRtsDataLabelFS, + mkRtsApFastLabel, + mkForeignLabel, mkCCLabel, mkCCSLabel, @@ -259,6 +261,8 @@ data RtsLabelInfo | RtsDataFS FastString -- misc rts data bits, eg CHARLIKE_closure | RtsCodeFS FastString -- misc rts code + | RtsApFast LitString -- _fast versions of generic apply + | RtsSlowTickyCtr String deriving (Eq, Ord) @@ -393,6 +397,8 @@ mkRtsRetLabelFS str = RtsLabel (RtsRetFS str) mkRtsCodeLabelFS str = RtsLabel (RtsCodeFS str) mkRtsDataLabelFS str = RtsLabel (RtsDataFS str) +mkRtsApFastLabel str = RtsLabel (RtsApFast str) + mkRtsSlowTickyCtrLabel :: String -> CLabel mkRtsSlowTickyCtrLabel pat = RtsLabel (RtsSlowTickyCtr pat) @@ -520,6 +526,7 @@ labelType (RtsLabel (RtsInfoFS _)) = DataLabel labelType (RtsLabel (RtsEntryFS _)) = CodeLabel labelType (RtsLabel (RtsRetInfoFS _)) = DataLabel labelType (RtsLabel (RtsRetFS _)) = CodeLabel +labelType (RtsLabel (RtsApFast _)) = CodeLabel labelType (CaseLabel _ CaseReturnInfo) = DataLabel labelType (CaseLabel _ _) = CodeLabel labelType (ModuleInitLabel _ _ _) = CodeLabel @@ -676,6 +683,8 @@ pprCLbl (RtsLabel (RtsData str)) = ptext str pprCLbl (RtsLabel (RtsCodeFS str)) = ftext str pprCLbl (RtsLabel (RtsDataFS str)) = ftext str +pprCLbl (RtsLabel (RtsApFast str)) = ptext str <> ptext SLIT("_fast") + pprCLbl (RtsLabel (RtsSelectorInfoTable upd_reqd offset)) = hcat [ptext SLIT("stg_sel_"), text (show offset), ptext (if upd_reqd diff --git a/ghc/compiler/codeGen/CgCallConv.hs b/ghc/compiler/codeGen/CgCallConv.hs index f5232a5..f463255 100644 --- a/ghc/compiler/codeGen/CgCallConv.hs +++ b/ghc/compiler/codeGen/CgCallConv.hs @@ -206,17 +206,20 @@ mkRegLiveness regs ptrs nptrs -- For a slow call, we must take a bunch of arguments and intersperse -- some stg_ap__ret_info return addresses. -constructSlowCall :: [(CgRep,CmmExpr)] -> (CLabel, [(CgRep,CmmExpr)]) +constructSlowCall + :: [(CgRep,CmmExpr)] + -> (CLabel, -- RTS entry point for call + [(CgRep,CmmExpr)], -- args to pass to the entry point + [(CgRep,CmmExpr)]) -- stuff to save on the stack + -- don't forget the zero case constructSlowCall [] - = (stg_ap_0, []) - where - stg_ap_0 = enterRtsRetLabel SLIT("stg_ap_0") + = (mkRtsApFastLabel SLIT("stg_ap_0"), [], []) constructSlowCall amodes - = (stg_ap_pat, these ++ slowArgs rest) + = (stg_ap_pat, these, rest) where - stg_ap_pat = enterRtsRetLabel arg_pat + stg_ap_pat = mkRtsApFastLabel arg_pat (arg_pat, these, rest) = matchSlowPattern amodes enterRtsRetLabel arg_pat diff --git a/ghc/compiler/codeGen/CgTailCall.lhs b/ghc/compiler/codeGen/CgTailCall.lhs index f76fcbd..dd7327b 100644 --- a/ghc/compiler/codeGen/CgTailCall.lhs +++ b/ghc/compiler/codeGen/CgTailCall.lhs @@ -149,56 +149,34 @@ performTailCall fun_info arg_amodes pending_assts -- A slow function call via the RTS apply routines -- Node must definitely point to the thing SlowCall -> do - { let (apply_lbl, new_amodes) = constructSlowCall arg_amodes - - -- Fill in all the arguments on the stack - ; (final_sp,stk_assts) <- mkStkAmodes sp new_amodes - - ; emitSimultaneously (node_asst `plusStmts` stk_assts - `plusStmts` pending_assts) - - ; when (not (null arg_amodes)) $ do + { when (not (null arg_amodes)) $ do { if (isKnownFun lf_info) then tickyKnownCallTooFewArgs else tickyUnknownCall - ; tickySlowCallPat (map fst arg_amodes) - } + ; tickySlowCallPat (map fst arg_amodes) + } - ; doFinalJump (final_sp + 1) - -- Add one, because the stg_ap functions - -- expect there to be a free slot on the stk - False (jumpToLbl apply_lbl) + ; let (apply_lbl, args, extra_args) + = constructSlowCall arg_amodes + + ; directCall sp apply_lbl args extra_args + (node_asst `plusStmts` pending_assts) } -- A direct function call (possibly with some left-over arguments) DirectEntry lbl arity -> do - { let - -- The args beyond the arity go straight on the stack - (arity_args, extra_stk_args) = splitAt arity arg_amodes - - -- First chunk of args go in registers - (reg_arg_amodes, stk_args) = assignCallRegs arity_args - - -- Any "extra" arguments are placed in frames on the - -- stack after the other arguments. - slow_stk_args = slowArgs extra_stk_args - - reg_assts = assignToRegs reg_arg_amodes - - ; if null slow_stk_args + { if arity == length arg_amodes then tickyKnownCallExact else do tickyKnownCallExtraArgs - tickySlowCallPat (map fst extra_stk_args) + tickySlowCallPat (map fst (drop arity arg_amodes)) - ; (final_sp, stk_assts) <- mkStkAmodes sp - (stk_args ++ slow_stk_args) - - ; emitSimultaneously (opt_node_asst `plusStmts` - reg_assts `plusStmts` - stk_assts `plusStmts` - pending_assts) - - ; doFinalJump final_sp False (jumpToLbl lbl) } + ; let + -- The args beyond the arity go straight on the stack + (arity_args, extra_args) = splitAt arity arg_amodes + + ; directCall sp lbl arity_args extra_args + (opt_node_asst `plusStmts` pending_assts) + } } where fun_name = idName (cgIdInfoId fun_info) @@ -206,6 +184,25 @@ performTailCall fun_info arg_amodes pending_assts +directCall sp lbl args extra_args assts = do + let + -- First chunk of args go in registers + (reg_arg_amodes, stk_args) = assignCallRegs args + + -- Any "extra" arguments are placed in frames on the + -- stack after the other arguments. + slow_stk_args = slowArgs extra_args + + reg_assts = assignToRegs reg_arg_amodes + -- + (final_sp, stk_assts) <- mkStkAmodes sp (stk_args ++ slow_stk_args) + + emitSimultaneously (reg_assts `plusStmts` + stk_assts `plusStmts` + assts) + + doFinalJump final_sp False (jumpToLbl lbl) + -- ----------------------------------------------------------------------------- -- The final clean-up before we do a jump at the end of a basic block. -- This code is shared by tail-calls and returns. diff --git a/ghc/includes/StgMiscClosures.h b/ghc/includes/StgMiscClosures.h index 432767d..844c846 100644 --- a/ghc/includes/StgMiscClosures.h +++ b/ghc/includes/StgMiscClosures.h @@ -379,7 +379,6 @@ RTS_ENTRY(stg_ap_7_upd_entry); /* standard application routines (see also rts/gen_apply.py, * and compiler/codeGen/CgStackery.lhs). */ -RTS_RET_INFO(stg_ap_0_info); RTS_RET_INFO(stg_ap_v_info); RTS_RET_INFO(stg_ap_f_info); RTS_RET_INFO(stg_ap_d_info); @@ -395,7 +394,6 @@ RTS_RET_INFO(stg_ap_pppp_info); RTS_RET_INFO(stg_ap_ppppp_info); RTS_RET_INFO(stg_ap_pppppp_info); -RTS_ENTRY(stg_ap_0_ret); RTS_ENTRY(stg_ap_v_ret); RTS_ENTRY(stg_ap_f_ret); RTS_ENTRY(stg_ap_d_ret); @@ -411,6 +409,22 @@ RTS_ENTRY(stg_ap_pppp_ret); RTS_ENTRY(stg_ap_ppppp_ret); RTS_ENTRY(stg_ap_pppppp_ret); +RTS_FUN(stg_ap_0_fast); +RTS_FUN(stg_ap_v_fast); +RTS_FUN(stg_ap_f_fast); +RTS_FUN(stg_ap_d_fast); +RTS_FUN(stg_ap_l_fast); +RTS_FUN(stg_ap_n_fast); +RTS_FUN(stg_ap_p_fast); +RTS_FUN(stg_ap_pv_fast); +RTS_FUN(stg_ap_pp_fast); +RTS_FUN(stg_ap_ppv_fast); +RTS_FUN(stg_ap_ppp_fast); +RTS_FUN(stg_ap_pppv_fast); +RTS_FUN(stg_ap_pppp_fast); +RTS_FUN(stg_ap_ppppp_fast); +RTS_FUN(stg_ap_pppppp_fast); + /* standard GC & stack check entry points, all defined in HeapStackCheck.hc */ RTS_RET_INFO(stg_enter_info); diff --git a/ghc/rts/Apply.cmm b/ghc/rts/Apply.cmm index 8d19d14..58ca18b 100644 --- a/ghc/rts/Apply.cmm +++ b/ghc/rts/Apply.cmm @@ -15,28 +15,13 @@ /* ---------------------------------------------------------------------------- * Evaluate a closure and return it. * - * stg_ap_0_info <--- Sp - * - * NOTE: this needs to be a polymorphic return point, because we can't - * be sure that the thing being evaluated is not a function. + * There isn't an info table / return address version of stg_ap_0, because + * everything being returned is guaranteed evaluated, so it would be a no-op. */ -#if MAX_VECTORED_RTN > 8 -#error MAX_VECTORED_RTN has changed: please modify stg_ap_0 too. -#endif - STRING(stg_ap_0_ret_str,"stg_ap_0_ret... ") -INFO_TABLE_RET( stg_ap_0, - 0/*framsize*/, 0/*bitmap*/, RET_SMALL, - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0), - RET_LBL(stg_ap_0) ) +stg_ap_0_fast { // fn is in R1, no args on the stack @@ -45,11 +30,10 @@ INFO_TABLE_RET( stg_ap_0, foreign "C" printClosure(R1 "ptr") [R1]); IF_DEBUG(sanity, - foreign "C" checkStackChunk(Sp+WDS(1) "ptr", + foreign "C" checkStackChunk(Sp "ptr", CurrentTSO + TSO_OFFSET_StgTSO_stack + WDS(StgTSO_stack_size(CurrentTSO)) "ptr") [R1]); - Sp_adj(1); ENTER(); } diff --git a/ghc/rts/Exception.cmm b/ghc/rts/Exception.cmm index 9d8d9d6..4bb9e48 100644 --- a/ghc/rts/Exception.cmm +++ b/ghc/rts/Exception.cmm @@ -102,10 +102,9 @@ blockAsyncExceptionszh_fast Sp(0) = stg_unblockAsyncExceptionszh_ret_info; } } - Sp_adj(-1); TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_v(); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } unblockAsyncExceptionszh_fast @@ -130,10 +129,9 @@ unblockAsyncExceptionszh_fast Sp(0) = stg_blockAsyncExceptionszh_ret_info; } } - Sp_adj(-1); TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_v(); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } @@ -307,10 +305,9 @@ catchzh_fast TICK_CATCHF_PUSHED(); /* Apply R1 to the realworld token */ - Sp_adj(-1); TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_v(); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } /* ----------------------------------------------------------------------------- @@ -372,8 +369,7 @@ retry_pop_stack: "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr"); StgTSO_trec(CurrentTSO) = trec; R1 = StgAtomicallyFrame_code(Sp); - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } } diff --git a/ghc/rts/Linker.c b/ghc/rts/Linker.c index 66057f7..a9faab5 100644 --- a/ghc/rts/Linker.c +++ b/ghc/rts/Linker.c @@ -402,7 +402,6 @@ typedef struct _RtsSymbolVal { #define RTS_RET_SYMBOLS \ SymX(stg_enter_ret) \ SymX(stg_gc_fun_ret) \ - SymX(stg_ap_0_ret) \ SymX(stg_ap_v_ret) \ SymX(stg_ap_f_ret) \ SymX(stg_ap_d_ret) \ @@ -617,7 +616,6 @@ typedef struct _RtsSymbolVal { SymX(stg_MUT_ARR_PTRS_FROZEN_info) \ SymX(stg_MUT_ARR_PTRS_FROZEN0_info) \ SymX(stg_WEAK_info) \ - SymX(stg_ap_0_info) \ SymX(stg_ap_v_info) \ SymX(stg_ap_f_info) \ SymX(stg_ap_d_info) \ @@ -632,6 +630,21 @@ typedef struct _RtsSymbolVal { SymX(stg_ap_pppp_info) \ SymX(stg_ap_ppppp_info) \ SymX(stg_ap_pppppp_info) \ + SymX(stg_ap_0_fast) \ + SymX(stg_ap_v_fast) \ + SymX(stg_ap_f_fast) \ + SymX(stg_ap_d_fast) \ + SymX(stg_ap_l_fast) \ + SymX(stg_ap_n_fast) \ + SymX(stg_ap_p_fast) \ + SymX(stg_ap_pv_fast) \ + SymX(stg_ap_pp_fast) \ + SymX(stg_ap_ppv_fast) \ + SymX(stg_ap_ppp_fast) \ + SymX(stg_ap_pppv_fast) \ + SymX(stg_ap_pppp_fast) \ + SymX(stg_ap_ppppp_fast) \ + SymX(stg_ap_pppppp_fast) \ SymX(stg_ap_1_upd_info) \ SymX(stg_ap_2_upd_info) \ SymX(stg_ap_3_upd_info) \ diff --git a/ghc/rts/PrimOps.cmm b/ghc/rts/PrimOps.cmm index f657a24..6a2b36f 100644 --- a/ghc/rts/PrimOps.cmm +++ b/ghc/rts/PrimOps.cmm @@ -996,8 +996,7 @@ INFO_TABLE_RET(stg_catch_retry_frame, R1 = StgCatchRetryFrame_first_code(frame); StgCatchRetryFrame_first_code_trec(frame) = new_trec; } - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } } @@ -1061,8 +1060,7 @@ INFO_TABLE_RET(stg_atomically_frame, "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") []; StgTSO_trec(CurrentTSO) = trec; R1 = StgAtomicallyFrame_code(frame); - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } } @@ -1097,8 +1095,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, StgTSO_trec(CurrentTSO) = trec; StgHeader_info(frame) = stg_atomically_frame_info; R1 = StgAtomicallyFrame_code(frame); - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } } @@ -1193,8 +1190,7 @@ atomicallyzh_fast StgTSO_trec(CurrentTSO) = new_trec; /* Apply R1 to the realworld token */ - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } @@ -1214,8 +1210,7 @@ catchSTMzh_fast StgCatchSTMFrame_handler(frame) = R2; /* Apply R1 to the realworld token */ - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } @@ -1248,8 +1243,7 @@ catchRetryzh_fast StgCatchRetryFrame_first_code_trec(frame) = new_trec; /* Apply R1 to the realworld token */ - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } @@ -1281,8 +1275,7 @@ retry_pop_stack: StgTSO_trec(CurrentTSO) = trec; StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true; R1 = StgCatchRetryFrame_alt_code(frame); - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } else { // Retry in the alternative code: propagate W_ other_trec; @@ -1306,8 +1299,7 @@ retry_pop_stack: StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false; StgTSO_trec(CurrentTSO) = trec; R1 = StgCatchRetryFrame_first_code(frame); - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } } } @@ -1332,8 +1324,7 @@ retry_pop_stack: StgTSO_trec(CurrentTSO) = trec; R1 = StgAtomicallyFrame_code(frame); Sp = frame; - Sp_adj(-1); - jump RET_LBL(stg_ap_v); + jump stg_ap_v_fast; } } diff --git a/ghc/rts/StgStdThunks.cmm b/ghc/rts/StgStdThunks.cmm index 4da4248..342a6eb 100644 --- a/ghc/rts/StgStdThunks.cmm +++ b/ghc/rts/StgStdThunks.cmm @@ -153,8 +153,7 @@ INFO_TABLE(stg_ap_1_upd,1,1,THUNK_1_0,"stg_ap_1_upd_info","stg_ap_1_upd_info") PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame; - Sp_adj(-1); // for stg_ap_0_ret - jump RET_LBL(stg_ap_0); + jump stg_ap_0_fast; } INFO_TABLE(stg_ap_2_upd,2,0,THUNK_2_0,"stg_ap_2_upd_info","stg_ap_2_upd_info") @@ -168,7 +167,7 @@ INFO_TABLE(stg_ap_2_upd,2,0,THUNK_2_0,"stg_ap_2_upd_info","stg_ap_2_upd_info") W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame - WDS(1); - Sp_adj(-1); // for stg_ap_0_ret + Sp_adj(-1); // for stg_ap_*_ret TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_p(); jump RET_LBL(stg_ap_p); @@ -186,7 +185,7 @@ INFO_TABLE(stg_ap_3_upd,3,0,THUNK,"stg_ap_3_upd_info","stg_ap_3_upd_info") W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame - WDS(2); - Sp_adj(-1); // for stg_ap_0_ret + Sp_adj(-1); // for stg_ap_*_ret TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_pp(); jump RET_LBL(stg_ap_pp); @@ -205,7 +204,7 @@ INFO_TABLE(stg_ap_4_upd,4,0,THUNK,"stg_ap_4_upd_info","stg_ap_4_upd_info") W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame - WDS(3); - Sp_adj(-1); // for stg_ap_0_ret + Sp_adj(-1); // for stg_ap_*_ret TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_ppp(); jump RET_LBL(stg_ap_ppp); @@ -225,7 +224,7 @@ INFO_TABLE(stg_ap_5_upd,5,0,THUNK,"stg_ap_5_upd_info","stg_ap_5_upd_info") W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame - WDS(4); - Sp_adj(-1); // for stg_ap_0_ret + Sp_adj(-1); // for stg_ap_*_ret TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_pppp(); jump RET_LBL(stg_ap_pppp); @@ -246,7 +245,7 @@ INFO_TABLE(stg_ap_6_upd,6,0,THUNK,"stg_ap_6_upd_info","stg_ap_6_upd_info") W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame - WDS(5); - Sp_adj(-1); // for stg_ap_0_ret + Sp_adj(-1); // for stg_ap_*_ret TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_ppppp(); jump RET_LBL(stg_ap_ppppp); @@ -268,7 +267,7 @@ INFO_TABLE(stg_ap_7_upd,7,0,THUNK,"stg_ap_7_upd_info","stg_ap_7_upd_info") W_[Sp-SIZEOF_StgUpdateFrame-WDS(6)] = StgThunk_payload(R1,1); R1 = StgThunk_payload(R1,0); Sp = Sp - SIZEOF_StgUpdateFrame - WDS(6); - Sp_adj(-1); // for stg_ap_0_ret + Sp_adj(-1); // for stg_ap_*_ret TICK_UNKNOWN_CALL(); TICK_SLOW_CALL_pppppp(); jump RET_LBL(stg_ap_pppppp); diff --git a/ghc/utils/genapply/GenApply.hs b/ghc/utils/genapply/GenApply.hs index 3f10ddf..1bdcad7 100644 --- a/ghc/utils/genapply/GenApply.hs +++ b/ghc/utils/genapply/GenApply.hs @@ -83,9 +83,14 @@ longRegs n = [ "L" ++ show m | m <- [1..n] ] loadRegArgs :: RegStatus -> Int -> [ArgRep] -> (Doc,Int) loadRegArgs regstatus sp args - = (vcat (map (uncurry assign_stk_to_reg) reg_locs), sp') - where - (reg_locs, _leftovers, sp') = assignRegs regstatus sp args + = (loadRegOffs reg_locs, sp') + where (reg_locs, _, sp') = assignRegs regstatus sp args + +loadRegOffs :: [(Reg,Int)] -> Doc +loadRegOffs = vcat . map (uncurry assign_stk_to_reg) + +saveRegOffs :: [(Reg,Int)] -> Doc +saveRegOffs = vcat . map (uncurry assign_reg_to_stk) -- a bit like assignRegs in CgRetConv.lhs assignRegs @@ -163,10 +168,15 @@ mkApplyName args mkApplyRetName args = mkApplyName args <> text "_ret" +mkApplyFastName args + = mkApplyName args <> text "_fast" + mkApplyInfoName args = mkApplyName args <> text "_info" -genMkPAP regstatus macro jump ticker disamb stack_apply +genMkPAP regstatus macro jump ticker disamb + no_load_regs -- don't load argumnet regs before jumping + args_in_regs -- arguments are already in regs is_pap args all_args_size fun_info_label = smaller_arity_cases $$ exact_arity_case @@ -175,45 +185,104 @@ genMkPAP regstatus macro jump ticker disamb stack_apply where n_args = length args - -- offset of args on the stack, see large comment above. - arg_sp_offset = 1 + -- offset of arguments on the stack at slow apply calls. + stk_args_slow_offset = 1 + + stk_args_offset + | args_in_regs = 0 + | otherwise = stk_args_slow_offset -- The SMALLER ARITY cases: -- if (arity == 1) { -- Sp[0] = Sp[1]; -- Sp[1] = (W_)&stg_ap_1_info; -- JMP_(GET_ENTRY(R1.cl)); - smaller_arity_cases = vcat [ smaller_arity i | i <- [1..n_args-1] ] smaller_arity arity = text "if (arity == " <> int arity <> text ") {" $$ - let - (reg_doc, sp') - | stack_apply = (empty, arg_sp_offset) - | otherwise = loadRegArgs regstatus arg_sp_offset these_args - in nest 4 (vcat [ text "TICK_SLOW_CALL_" <> text ticker <> text "_TOO_MANY();", - reg_doc, - vcat [ shuffle_down j | j <- [sp'..these_args_size] ], - loadSpWordOff "W_" these_args_size <> text " = " <> - mkApplyInfoName rest_args <> semi, - text "Sp_adj(" <> int (sp' - 1) <> text ");", + + -- load up regs for the call, if necessary + load_regs, + + -- If we have more args in registers than are required + -- for the call, then we must save some on the stack, + -- and set up the stack for the follow-up call. + -- If the extra arguments are on the stack, then we must + -- instead shuffle them down to make room for the info + -- table for the follow-on call. + if overflow_regs + then save_extra_regs + else shuffle_extra_args, + -- for a PAP, we have to arrange that the stack contains a -- return address in the even that stg_PAP_entry fails its -- heap check. See stg_PAP_entry in Apply.hc for details. if is_pap - then text "R2 = " <> mkApplyInfoName these_args <> semi + then text "R2 = " <> mkApplyInfoName this_call_args <> semi + else empty, text "jump " <> text jump <> semi ]) $$ text "}" + where - (these_args, rest_args) = splitAt arity args - these_args_size = sum (map argSize these_args) + -- offsets in case we need to save regs: + (reg_locs, _, _) + = assignRegs regstatus stk_args_offset args + + -- register assignment for *this function call* + (reg_locs', reg_call_leftovers, reg_call_sp_stk_args) + = assignRegs regstatus stk_args_offset (take arity args) + + load_regs + | no_load_regs || args_in_regs = empty + | otherwise = loadRegOffs reg_locs' + + (this_call_args, rest_args) = splitAt arity args + + -- the offset of the stack args from initial Sp + sp_stk_args + | args_in_regs = stk_args_offset + | no_load_regs = stk_args_offset + | otherwise = reg_call_sp_stk_args + + -- the stack args themselves + this_call_stack_args + | args_in_regs = reg_call_leftovers -- sp offsets are wrong + | no_load_regs = this_call_args + | otherwise = reg_call_leftovers + + stack_args_size = sum (map argSize this_call_stack_args) - shuffle_down i = + overflow_regs = args_in_regs && length reg_locs > length reg_locs' + + save_extra_regs + = -- we have extra arguments in registers to save + let + extra_reg_locs = drop (length reg_locs') (reverse reg_locs) + adj_reg_locs = [ (reg, off - adj + 1) | + (reg,off) <- extra_reg_locs ] + adj = case extra_reg_locs of + (reg, fst_off):_ -> fst_off + size = snd (last adj_reg_locs) + in + text "Sp_adj(" <> int (-size - 1) <> text ");" $$ + saveRegOffs adj_reg_locs $$ + loadSpWordOff "W_" 0 <> text " = " <> + mkApplyInfoName rest_args <> semi + + shuffle_extra_args + = vcat (map shuffle_down + [sp_stk_args .. sp_stk_args+stack_args_size-1]) $$ + loadSpWordOff "W_" (sp_stk_args+stack_args_size-1) + <> text " = " + <> mkApplyInfoName rest_args <> semi $$ + text "Sp_adj(" <> int (sp_stk_args - 1) <> text ");" + + shuffle_down i = loadSpWordOff "W_" (i-1) <> text " = " <> loadSpWordOff "W_" i <> semi @@ -227,8 +296,8 @@ genMkPAP regstatus macro jump ticker disamb stack_apply = text "if (arity == " <> int n_args <> text ") {" $$ let (reg_doc, sp') - | stack_apply = (empty, arg_sp_offset) - | otherwise = loadRegArgs regstatus arg_sp_offset args + | no_load_regs || args_in_regs = (empty, stk_args_offset) + | otherwise = loadRegArgs regstatus stk_args_offset args in nest 4 (vcat [ text "TICK_SLOW_CALL_" <> text ticker <> text "_CORRECT();", @@ -248,8 +317,17 @@ genMkPAP regstatus macro jump ticker disamb stack_apply larger_arity_case = text "} else {" $$ + let + save_regs + | args_in_regs = + text "Sp_adj(" <> int (-sp_offset) <> text ");" $$ + saveRegOffs reg_locs + | otherwise = + empty + in nest 4 (vcat [ text "TICK_SLOW_CALL_" <> text ticker <> text "_TOO_FEW();", + save_regs, text macro <> char '(' <> int n_args <> comma <> int all_args_size <> text "," <> fun_info_label <> @@ -257,6 +335,11 @@ genMkPAP regstatus macro jump ticker disamb stack_apply text ");" ]) $$ char '}' + where + -- offsets in case we need to save regs: + (reg_locs, leftovers, sp_offset) + = assignRegs regstatus stk_args_slow_offset args + -- BUILD_PAP assumes args start at offset 1 -- ----------------------------------------------------------------------------- -- generate an apply function @@ -342,7 +425,7 @@ genApply regstatus args = text "arity = TO_W_(StgBCO_arity(R1));", text "ASSERT(arity > 0);", genMkPAP regstatus "BUILD_PAP" "ENTRY_LBL(stg_BCO)" "FUN" "BCO" - True{-stack apply-} False{-not a PAP-} + True{-stack apply-} False{-args on stack-} False{-not a PAP-} args all_args_size fun_info_label ]), text "}", @@ -361,7 +444,7 @@ genApply regstatus args = text "arity = TO_W_(StgFunInfoExtra_arity(%GET_FUN_INFO(R1)));", text "ASSERT(arity > 0);", genMkPAP regstatus "BUILD_PAP" "%GET_ENTRY(R1)" "FUN" "FUN" - False{-reg apply-} False{-not a PAP-} + False{-reg apply-} False{-args on stack-} False{-not a PAP-} args all_args_size fun_info_label ]), text "}", @@ -375,7 +458,7 @@ genApply regstatus args = text "arity = TO_W_(StgPAP_arity(R1));", text "ASSERT(arity > 0);", genMkPAP regstatus "NEW_PAP" "ENTRY_LBL(stg_PAP)" "PAP" "PAP" - True{-stack apply-} True{-is a PAP-} + True{-stack apply-} False{-args on stack-} True{-is a PAP-} args all_args_size fun_info_label ]), text "}", @@ -437,6 +520,59 @@ genApply regstatus args = ] -- ----------------------------------------------------------------------------- +-- Making a fast unknown application, args are in regs + +genApplyFast regstatus args = + let + fun_fast_label = mkApplyFastName args + fun_ret_label = text "RET_LBL" <> parens (mkApplyName args) + fun_info_label = mkApplyInfoName args + all_args_size = sum (map argSize args) + in + vcat [ + fun_fast_label, + char '{', + nest 4 (vcat [ + text "W_ info;", + text "W_ arity;", + text "info = %GET_STD_INFO(R1);", + text "switch [INVALID_OBJECT .. N_CLOSURE_TYPES] (%INFO_TYPE(info)) {", + nest 4 (vcat [ + text "case FUN,", + text " FUN_1_0,", + text " FUN_0_1,", + text " FUN_2_0,", + text " FUN_1_1,", + text " FUN_0_2,", + text " FUN_STATIC: {", + nest 4 (vcat [ + text "arity = TO_W_(StgFunInfoExtra_arity(%GET_FUN_INFO(R1)));", + text "ASSERT(arity > 0);", + genMkPAP regstatus "BUILD_PAP" "%GET_ENTRY(R1)" "FUN" "FUN" + False{-reg apply-} True{-args in regs-} False{-not a PAP-} + args all_args_size fun_info_label + ]), + char '}', + + text "default: {", + let + (reg_locs, leftovers, sp_offset) = assignRegs regstatus 1 args + -- leave a one-word space on the top of the stack when + -- calling the slow version + in + nest 4 (vcat [ + text "Sp_adj" <> parens (int (-sp_offset)) <> semi, + saveRegOffs reg_locs, + text "jump" <+> fun_ret_label <> semi + ]), + char '}' + ]), + char '}' + ]), + char '}' + ] + +-- ----------------------------------------------------------------------------- -- Making a stack apply -- These little functions are like slow entry points. They provide @@ -488,7 +624,7 @@ genStackSave regstatus args = ] where body = vcat [text "Sp_adj" <> parens (int (-sp_offset)) <> semi, - vcat (map (uncurry assign_reg_to_stk) reg_locs), + saveRegOffs reg_locs, text "Sp(2) = R1;", text "Sp(1) =" <+> int stk_args <> semi, text "Sp(0) = stg_gc_fun_info;", @@ -525,6 +661,9 @@ main = do vcat (intersperse (text "") $ map (genStackFns regstatus) stackApplyTypes), + vcat (intersperse (text "") $ + map (genApplyFast regstatus) applyTypes), + genStackApplyArray stackApplyTypes, genStackSaveArray stackApplyTypes, genBitmapArray stackApplyTypes, -- 1.7.10.4