From e3ce2c8ba502144c0328467576c28432ed7e9c66 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 15 Dec 2017 15:40:00 -0500 Subject: [PATCH 01/43] Integrate spatial reuse with temporal reuse; Added time reuse distance and cacheline reuse distance --- src/tool/hpcrun/hpcrun_stats.c | 15 +- src/tool/hpcrun/hpcrun_stats.h | 3 +- .../sample-sources/watchpoint_clients.c | 204 ++++++++++-------- .../sample-sources/watchpoint_support.h | 8 +- 4 files changed, 128 insertions(+), 102 deletions(-) diff --git a/src/tool/hpcrun/hpcrun_stats.c b/src/tool/hpcrun/hpcrun_stats.c index 11ba822acf..cf799af27d 100644 --- a/src/tool/hpcrun/hpcrun_stats.c +++ b/src/tool/hpcrun/hpcrun_stats.c @@ -102,7 +102,8 @@ static atomic_long num_falseWWIns = ATOMIC_VAR_INIT(0); static atomic_long num_falseRWIns = ATOMIC_VAR_INIT(0); static atomic_long num_falseWRIns = ATOMIC_VAR_INIT(0); -static atomic_long num_reuse = ATOMIC_VAR_INIT(0); +static atomic_long num_reuseSpatial = ATOMIC_VAR_INIT(0); +static atomic_long num_reuseTemporal = ATOMIC_VAR_INIT(0); static atomic_long num_latency = ATOMIC_VAR_INIT(0); static atomic_long num_unwind_intervals_total = ATOMIC_VAR_INIT(0); @@ -334,11 +335,17 @@ hpcrun_stats_num_accessedIns_inc(long val) } void -hpcrun_stats_num_reuse_inc(long val) +hpcrun_stats_num_reuseTemporal_inc(long val) { - atomic_fetch_add_explicit(&num_reuse, val, memory_order_relaxed); + atomic_fetch_add_explicit(&num_reuseTemporal, val, memory_order_relaxed); } +void +hpcrun_stats_num_reuseSpatial_inc(long val) +{ + atomic_fetch_add_explicit(&num_reuseSpatial, val, memory_order_relaxed); +} + void hpcrun_stats_num_latency_inc(long val) { @@ -639,7 +646,7 @@ hpcrun_stats_print_summary(void) //AMSG("WATCHPOINT ANOMALIES: samples:%ld, SM_imprecise:%ld, WP_Set:%ld, WP_triggered:%ld, WP_SampleTriggering:%ld, WP_ImpreciseIP:%ld, WP_InsaneIP:%ld, WP_Off8Addr:%ld, WP_ImpreciseAddr:%ld, WP_Dropped:%ld", num_samples_total, num_samples_imprecise, num_watchpoints_set, num_watchpoints_triggered, 
num_sample_triggering_watchpoints, num_watchpoints_imprecise, num_insane_ip, num_watchpoints_imprecise_address_8_byte, num_watchpoints_imprecise_address, num_watchpoints_dropped); AMSG("WATCHPOINT ANOMALIES: samples:%.2e, SM_imprecise:%.2e, WP_Set:%.2e, WP_triggered:%.2e, WP_SampleTriggering:%.2e, WP_ImpreciseIP:%.2e, WP_InsaneIP:%.2e, WP_Off8Addr:%.2e, WP_ImpreciseAddr:%.2e, WP_Dropped:%.2e", (double)atomic_load(&num_samples_total), (double)atomic_load(&num_samples_imprecise), (double)atomic_load(&num_watchpoints_set), (double)atomic_load(&num_watchpoints_triggered), (double)atomic_load(&num_sample_triggering_watchpoints), (double)atomic_load(&num_watchpoints_imprecise), (double)atomic_load(&num_insane_ip), (double)atomic_load(&num_watchpoints_imprecise_address_8_byte), (double)atomic_load(&num_watchpoints_imprecise_address), (double)atomic_load(&num_watchpoints_dropped)); - AMSG("WATCHPOINT STATS: writtenBytes:%ld, usedBytes:%ld, deadBytes:%ld, newBytes:%ld, oldBytes:%ld, oldAppxBytes:%ld, loadedBytes:%ld, accessedIns:%ld, falseWWIns:%ld, falseRWIns:%ld, falseWRIns:%ld, trueWWIns:%ld, trueRWIns:%ld, trueWRIns:%ld, RSS:%ld, reuse:%ld, latency:%ld", num_writtenBytes, num_usedBytes, num_deadBytes, num_newBytes, num_oldBytes, num_oldAppxBytes, num_loadedBytes, num_accessedIns, num_falseWWIns, num_falseRWIns, num_falseWRIns, num_trueWWIns, num_trueRWIns, num_trueWRIns, (size_t)(rusage.ru_maxrss), num_reuse, num_latency); + AMSG("WATCHPOINT STATS: writtenBytes:%ld, usedBytes:%ld, deadBytes:%ld, newBytes:%ld, oldBytes:%ld, oldAppxBytes:%ld, loadedBytes:%ld, accessedIns:%ld, falseWWIns:%ld, falseRWIns:%ld, falseWRIns:%ld, trueWWIns:%ld, trueRWIns:%ld, trueWRIns:%ld, RSS:%ld, reuseTemporal:%ld, reuseSpatial:%ldlatency:%ld", num_writtenBytes, num_usedBytes, num_deadBytes, num_newBytes, num_oldBytes, num_oldAppxBytes, num_loadedBytes, num_accessedIns, num_falseWWIns, num_falseRWIns, num_falseWRIns, num_trueWWIns, num_trueRWIns, num_trueWRIns, (size_t)(rusage.ru_maxrss), 
num_reuseTemporal, num_reuseSpatial, num_latency); AMSG("SAMPLE ANOMALIES: blocks: %ld (async: %ld, dlopen: %ld), " "errors: %ld (segv: %ld, soft: %ld)", diff --git a/src/tool/hpcrun/hpcrun_stats.h b/src/tool/hpcrun/hpcrun_stats.h index 9860eac446..b76a957c2f 100644 --- a/src/tool/hpcrun/hpcrun_stats.h +++ b/src/tool/hpcrun/hpcrun_stats.h @@ -85,7 +85,8 @@ void hpcrun_stats_num_deadBytes_inc(long val); void hpcrun_stats_num_newBytes_inc(long val); void hpcrun_stats_num_oldBytes_inc(long val); void hpcrun_stats_num_oldAppxBytes_inc(long val); -void hpcrun_stats_num_reuse_inc(long val); +void hpcrun_stats_num_reuseTemporal_inc(long val); +void hpcrun_stats_num_reuseSpatial_inc(long val); void hpcrun_stats_num_loadedBytes_inc(long val); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index dbb130fc23..0c5aa895a6 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -144,8 +144,13 @@ int load_metric_id = -1; int dead_metric_id = -1; int measured_metric_id = -1; int latency_metric_id = -1; -int temporal_metric_id = -1; -int spatial_metric_id = -1; + +int temporal_reuse_metric_id = -1; +int spatial_reuse_metric_id = -1; +int reuse_time_distance_metric_id = -1; // use rdtsc() to represent the reuse distance +int reuse_cacheline_distance_metric_id = -1; // use cache miss to reprent the reuse distance +int reuse_trapped_metric_id = -1; // the times of a watch point trapped + int false_ww_metric_id = -1; int false_rw_metric_id = -1; int false_wr_metric_id = -1; @@ -153,6 +158,7 @@ int true_ww_metric_id = -1; int true_rw_metric_id = -1; int true_wr_metric_id = -1; + #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; @@ -193,8 +199,7 @@ __thread WPStats_t wpStats; #define WP_DEADSPY_EVENT_NAME "WP_DEADSPY" #define WP_REDSPY_EVENT_NAME "WP_REDSPY" #define 
WP_LOADSPY_EVENT_NAME "WP_LOADSPY" -#define WP_TEMPORAL_REUSE_EVENT_NAME "WP_TEMPORAL_REUSE" -#define WP_SPATIAL_REUSE_EVENT_NAME "WP_SPATIAL_REUSE" +#define WP_REUSE_EVENT_NAME "WP_REUSE" #define WP_FALSE_SHARING_EVENT_NAME "WP_FALSE_SHARING" #define WP_TRUE_SHARING_EVENT_NAME "WP_TRUE_SHARING" #define WP_ALL_SHARING_EVENT_NAME "WP_ALL_SHARING" @@ -207,8 +212,7 @@ typedef enum WP_CLIENT_ID{ WP_DEADSPY, WP_REDSPY, WP_LOADSPY, - WP_TEMPORAL_REUSE, - WP_SPATIAL_REUSE, + WP_REUSE, WP_FALSE_SHARING, WP_ALL_SHARING, WP_TRUE_SHARING, @@ -253,7 +257,8 @@ __thread uint64_t falseRWIns = 0; __thread uint64_t trueWWIns = 0; __thread uint64_t trueWRIns = 0; __thread uint64_t trueRWIns = 0; -__thread uint64_t reuse = 0; +__thread uint64_t reuseTemporal = 0; +__thread uint64_t reuseSpatial = 0; // Some stats __thread long int correct=0; @@ -288,8 +293,7 @@ __thread long ipDiff=0; static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOffseti, int safeAccessLen, WatchPointTrigger_t * wt); -static WPTriggerActionType TemporalReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); -static WPTriggerActionType SpatialReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); +static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType LoadLoadWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType FalseSharingWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType AllSharingWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); @@ -324,21 +328,13 @@ static WpClientConfig_t 
wpClientConfig[] = { .preWPAction = DISABLE_WP, .configOverrideCallback = LoadSpyWPConfigOverride }, - /**** Temporal Reuse ***/ - { - .id = WP_TEMPORAL_REUSE, - .name = WP_TEMPORAL_REUSE_EVENT_NAME, - .wpCallback = TemporalReuseWPCallback, - .preWPAction = DISABLE_WP, - .configOverrideCallback = TemporalReuseWPConfigOverride - }, - /**** Spatial Reuse ***/ + /**** Reuse ***/ { - .id = WP_SPATIAL_REUSE, - .name = WP_SPATIAL_REUSE_EVENT_NAME, - .wpCallback = SpatialReuseWPCallback, + .id = WP_REUSE, + .name = WP_REUSE_EVENT_NAME, + .wpCallback = ReuseWPCallback, .preWPAction = DISABLE_WP, - .configOverrideCallback = SpatialReuseWPConfigOverride + .configOverrideCallback = ReuseWPConfigOverride }, /**** False Sharing ***/ { @@ -431,6 +427,7 @@ static void PopulateBlackListAddresses() { const char delim[] = " \n"; addr = strtok_r(l, delim, &save); char* perms = strtok_r(NULL, delim, &save); + (void) perms; //supress compiler's warning // skip 3 tokens for (int i=0; i < 3; i++) { (void) strtok_r(NULL, delim, &save);} char* name = strtok_r(NULL, delim, &save); @@ -525,13 +522,11 @@ static void ClientTermination(){ hpcrun_stats_num_oldBytes_inc(oldBytes); hpcrun_stats_num_oldAppxBytes_inc(oldAppxBytes); break; - case WP_TEMPORAL_REUSE: + case WP_REUSE: hpcrun_stats_num_accessedIns_inc(accessedIns); - hpcrun_stats_num_reuse_inc(reuse); - break; - case WP_SPATIAL_REUSE: + hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); hpcrun_stats_num_accessedIns_inc(accessedIns); - hpcrun_stats_num_reuse_inc(reuse); + hpcrun_stats_num_reuseSpatial_inc(reuseSpatial); break; case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: @@ -815,16 +810,18 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(redApprox_metric_id, "BYTES_RED_APPROX", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - case WP_TEMPORAL_REUSE: - temporal_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(temporal_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, 
metric_property_none); + case WP_REUSE: + temporal_reuse_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + spatial_reuse_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(spatial_reuse_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_time_distance_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_cacheline_distance_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_cacheline_distance_metric_id, "CACHELIN_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_trapped_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_trapped_metric_id, "REUSE_TRAP_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - - case WP_SPATIAL_REUSE: - spatial_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(spatial_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); - break; - case WP_ALL_SHARING: case WP_IPC_ALL_SHARING: // must have a canonical load map across processes @@ -875,9 +872,7 @@ METHOD_FN(display_events) printf("---------------------------------------------------------------------------\n"); printf("%s\n", WP_LOADSPY_EVENT_NAME); printf("---------------------------------------------------------------------------\n"); - printf("%s\n", WP_TEMPORAL_REUSE_EVENT_NAME); - printf("---------------------------------------------------------------------------\n"); - printf("%s\n", WP_SPATIAL_REUSE_EVENT_NAME); + printf("%s\n", WP_REUSE_EVENT_NAME); printf("---------------------------------------------------------------------------\n"); printf("%s\n", WP_FALSE_SHARING_EVENT_NAME); printf("---------------------------------------------------------------------------\n"); @@ -1067,6 +1062,19 @@ static 
inline void UpdateConcatenatedPathPair(void *ctxt, cct_node_t * oldNode, // update the foundMetric cct_metric_data_increment(metricId, node, (cct_metric_data_t){.i = metricInc}); } +static inline void UpdateConcatenatedPathPairMultiple(void *ctxt, cct_node_t * oldNode, const void * joinNode, int *metricIdArray, uint64_t *metricIncArray, uint32_t numMetric){ + if (numMetric == 0) return; + // unwind call stack once + sample_val_t v = hpcrun_sample_callpath(ctxt, metricIdArray[0], SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + // insert a special node + cct_node_t *node = hpcrun_insert_special_node(oldNode, joinNode); + // concatenate call paths + node = hpcrun_cct_insert_path_return_leaf(v.sample_node, node); + for(uint32_t i = 0; i < numMetric; i++){ + // update the foundMetric + cct_metric_data_increment(metricIdArray[i], node, (cct_metric_data_t){.i = metricIncArray[i]}); + } +} static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ @@ -1267,7 +1275,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf return ALREADY_DISABLED; } -static WPTriggerActionType TemporalReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ +static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ if(!wt->pc) { // if the ip is 0, let's retain the WP return RETAIN_WP; @@ -1275,29 +1283,31 @@ static WPTriggerActionType TemporalReuseWPCallback(WatchPointInfo_t *wpi, int st // Report a reuse double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); + uint64_t inc = numDiffSamples; int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; - // Now increment temporal_metric_id by numDiffSamples * overlapBytes - uint64_t inc = numDiffSamples; - reuse += inc; - UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_TEPORALLY_REUSED][joinNodeIdx] /* joinNode*/, temporal_metric_id /* checkedMetric */, inc); - return ALREADY_DISABLED; -} + uint64_t time_distance = rdtsc() - wpi->startTime; + uint64_t cacheline_distance; // readcounter - wpi->sample.cacheMissCount //TODO:jqswang -static WPTriggerActionType SpatialReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ - if(!wt->pc) { - // if the ip is 0, drop the WP - return ALREADY_DISABLED; - } - // Report a reuse - double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); - uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; - // Now increment dead_metric_id by numDiffSamples * overlapBytes - uint64_t inc = numDiffSamples; - reuse += inc; + //prepare the metric updating arrays + int metricIdArray[4]; + uint64_t metricIncArray[4]; - UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, spatial_metric_id /* checkedMetric */, inc); + metricIncArray[0]=inc; + metricIdArray[1]=reuse_time_distance_metric_id; metricIncArray[1]=time_distance; + metricIdArray[2]=reuse_cacheline_distance_metric_id; metricIncArray[2]=cacheline_distance; + metricIdArray[3]=reuse_trapped_metric_id; metricIncArray[3]=1; + + if (wpi->sample.reuseType == REUSE_TEMPORAL){ + reuseTemporal += inc; + metricIdArray[0] = temporal_reuse_metric_id; + UpdateConcatenatedPathPairMultiple(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_TEPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + } + else { + reuseSpatial += inc; + metricIdArray[0] = spatial_reuse_metric_id; + UpdateConcatenatedPathPairMultiple(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + } return ALREADY_DISABLED; } @@ -2130,8 +2140,8 @@ bool PrintStats(){ #endif bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, int sampledMetricId) { - void * data_addr = mmap_data->addr; - void * precisePC = (mmap_data->header_misc & PERF_RECORD_MISC_EXACT_IP) ? mmap_data->ip : 0; + void * data_addr = (void *)mmap_data->addr; + void * precisePC = (void *)((mmap_data->header_misc & PERF_RECORD_MISC_EXACT_IP) ? 
mmap_data->ip : 0); // Filert out address and PC (0 or kernel address will not pass) if (!IsValidAddress(data_addr, precisePC)) { goto ErrExit; // incorrect access type @@ -2252,7 +2262,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } } break; - case WP_SPATIAL_REUSE:{ + case WP_REUSE:{ long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; @@ -2260,7 +2270,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .node = node, .type=WP_RW, .accessType=accessType, - .wpLength = accessLen, + //.wpLength = accessLen, // set later .accessLength= accessLen, .sampledMetricId=sampledMetricId, .isSamplePointAccurate = isSamplePointAccurate, @@ -2268,42 +2278,48 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .isBackTrace = false }; sd.wpLength = GetFloorWPLength(accessLen); - // randomly protect another word in the same cache line - uint64_t aligned_pc = ALIGN_TO_CACHE_LINE((uint64_t)data_addr); - if ((rdtsc() & 1) == 0) - sd.va = (void*) (aligned_pc - CACHE_LINE_SZ); - else - sd.va = (void *) (aligned_pc + CACHE_LINE_SZ); + if (rdtsc() & 1) { // 50% chance to detect spatial reuse + int wpSizes[] = {8, 4, 2, 1}; + FalseSharingLocs falseSharingLocs[CACHE_LINE_SZ]; + int numFSLocs = 0; + GetAllFalseSharingLocations((size_t)data_addr, accessLen, ALIGN_TO_CACHE_LINE((size_t)(data_addr)), CACHE_LINE_SZ, wpSizes, 0 /*curWPSizeIdx*/ , 4 /*totalWPSizes*/, falseSharingLocs, &numFSLocs); + assert(numFSLocs > 0); // at least there is one location to monitor + int idx = rdtsc() % numFSLocs; //randomly choose one location to monitor + sd.va = (void *)falseSharingLocs[idx].va; + sd.reuseType = REUSE_SPATIAL; #if 0 - int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; - int bound = CACHE_LINE_SZ / accessLen; - int r = rdtsc() % bound; - if (r == offset) r = (r+1) % bound; - sd.va = aligned_pc + (r * accessLen); + // jqswang: I am not 
sure what the following code does + // randomly protect another word in the same cache line + uint64_t aligned_pc = ALIGN_TO_CACHE_LINE((uint64_t)data_addr); + if ((rdtsc() & 1) == 0) + sd.va = (void*) (aligned_pc - CACHE_LINE_SZ); + else + sd.va = (void *) (aligned_pc + CACHE_LINE_SZ); #endif +#if 0 + int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; + int bound = CACHE_LINE_SZ / accessLen; + int r = rdtsc() % bound; + if (r == offset) r = (r+1) % bound; + sd.va = aligned_pc + (r * accessLen); +#endif + } + else { // 50% chance to detect the temporal reuse + sd.va = data_addr; + sd.reuseType = REUSE_TEMPORAL; + } if (!IsValidAddress(sd.va, precisePC)) { goto ErrExit; // incorrect access type } - SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); - } - break; - case WP_TEMPORAL_REUSE:{ - long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; - accessedIns += metricThreshold; - - SampleData_t sd= { - .va = data_addr, - .node = node, - .type=WP_RW, - .accessType=accessType, - .wpLength = accessLen, - .accessLength= accessLen, - .sampledMetricId=sampledMetricId, - .isSamplePointAccurate = isSamplePointAccurate, - .preWPAction=theWPConfig->preWPAction, - .isBackTrace = false - }; - sd.wpLength = GetFloorWPLength(accessLen); + //make sure the following variables have been set + //assert(cache_miss_event_set >= 0); + //assert(cache_miss_event_seq >= 0); + + // Read the cache miss counter + long long cacheMissCount; + //TODO: jqswang assert(ReadEventCounter(cache_miss_event_set /* for PAPI */, cache_miss_event_seq, &cacheMissCount) == PAPI_OK); + sd.cachelineReuseDistance = cacheMissCount; + // register the watchpoint SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index 2e198f1d4b..06d01523e9 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ 
b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -83,7 +83,8 @@ typedef enum ReplacementPolicy {AUTO, EMPTY_SLOT_ONLY, OLDEST, NEWEST} Replaceme typedef enum MergePolicy {AUTO_MERGE, NO_MERGE, CLIENT_ACTION} MergePolicy; typedef enum OverwritePolicy {OVERWRITE, NO_OVERWRITE} OverwritePolicy; typedef enum VictimType {EMPTY_SLOT, NON_EMPTY_SLOT, NONE_AVAILABLE} VictimType; -typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; +typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; //jqswang: what do they mean? +typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL} ReuseType; // for reuse client // Data structure that is given by clients to set a WP typedef struct SampleData{ @@ -101,6 +102,8 @@ typedef struct SampleData{ WPTriggerActionType preWPAction; bool isSamplePointAccurate; bool isBackTrace; + ReuseType reuseType; + uint64_t cachelineReuseDistance; } SampleData_t; typedef struct WatchPointInfo{ @@ -150,8 +153,7 @@ extern bool IsAltStackAddress(void *addr); extern double ProportionOfWatchpointAmongOthersSharingTheSameContext(WatchPointInfo_t *wpi); -extern void TemporalReuseWPConfigOverride(void*); -extern void SpatialReuseWPConfigOverride(void*); +extern void ReuseWPConfigOverride(void*); extern void FalseSharingWPConfigOverride(void*); extern void TrueSharingWPConfigOverride(void*); extern void AllSharingWPConfigOverride(void*); From 67beac907fd110a30fe0e7d2561940e2ef428dfd Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 16 Dec 2017 17:12:11 -0500 Subject: [PATCH 02/43] Fixed some bugs --- src/tool/hpcrun/sample-sources/watchpoint_support.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index b3ccae937f..2aace57f02 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ 
b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -355,20 +355,12 @@ void IPCAllSharingWPConfigOverride(void *v){ } -void TemporalReuseWPConfigOverride(void *v){ +void ReuseWPConfigOverride(void *v){ // dont fix IP wpConfig.dontFixIP = true; wpConfig.dontDisassembleWPAddress = true; } -void SpatialReuseWPConfigOverride(void *v){ - // dont fix IP - wpConfig.dontFixIP = true; - wpConfig.dontDisassembleWPAddress = true; -} - - - static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, bool modify) { // Perf event settings struct perf_event_attr pe = { From 7add708ee57790ee9a7c9a6be5411dcea56bc214 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 16 Dec 2017 17:48:16 -0500 Subject: [PATCH 03/43] If the sampling period is set to 0, leave it be and the corresponding event will not generate any overflow --- src/tool/hpcrun/sample-sources/perf/linux_perf.c | 8 ++++++-- src/tool/hpcrun/utilities/tokenize.c | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 77a3e37a73..53d32425d4 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -202,12 +202,16 @@ static const char *event_name = "CPU_CYCLES"; #endif +//****************************************************************************** +// global variables +//****************************************************************************** + +int reuse_cacheline_distance_event_index = -1; //****************************************************************************** // local variables //****************************************************************************** - static sigset_t sig_mask; // a list of main description of events, shared between threads @@ -792,7 +796,7 @@ METHOD_FN(process_event_list, int lush_metrics) int period_type = hpcrun_extract_ev_thresh(event, sizeof(name), name, &threshold, 
default_threshold.threshold_num); - + //printf("period_type %d\n", period_type);printf("threshold %ld, %ld\n", threshold, default_threshold.threshold_num); //jqswang // ------------------------------------------------------------ // need a special case if we have our own customized predefined event // This "customized" event will use one or more perf events diff --git a/src/tool/hpcrun/utilities/tokenize.c b/src/tool/hpcrun/utilities/tokenize.c index c3fb04d9d8..ef78e76373 100644 --- a/src/tool/hpcrun/utilities/tokenize.c +++ b/src/tool/hpcrun/utilities/tokenize.c @@ -113,7 +113,8 @@ hpcrun_extract_threshold(const char *in, long *th, long def) pos = 1; } long num = strtol(in + pos, (char **)NULL, 10); - *th = (num == 0 ? def : num); + *th = num; // if the default threshold is 0, just leave it be + //*th = (num == 0 ? def : num); if (is_period) return 2; From 3ef77deb67bb03173daaaaa343413cfdf3773e9e Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 16 Dec 2017 18:12:23 -0500 Subject: [PATCH 04/43] Added the function int read_event_counter(event_thread_t *current,uint64_t *val) to read the counter value --- .../hpcrun/sample-sources/perf/perf_mmap.c | 38 +++++++++++++++++++ .../hpcrun/sample-sources/perf/perf_mmap.h | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index f19d140a3f..f5cb88457f 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -406,10 +406,48 @@ parse_buffer(int sample_type, event_thread_t *current, perf_mmap_data_t *mmap_in return data_read; } + +#if defined(__x86_64__) || defined(__i386__) + +#ifdef __x86_64__ +#define DECLARE_ARGS(val, low, high) unsigned low, high +#define EAX_EDX_VAL(val, low, high) ((low) | ((uint64_t )(high) << 32)) +#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else 
+#define DECLARE_ARGS(val, low, high) unsigned long long val +#define EAX_EDX_VAL(val, low, high) (val) +#define EAX_EDX_ARGS(val, low, high) "A" (val) +#define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + +#define barrier() __asm__ __volatile__("": : :"memory") + +static inline int rdpmc(pe_mmap_t *mmap, uint64_t *value) +{ + int counter = mmap->index - 1; + DECLARE_ARGS(val, low, high); + + if (counter < 0) return -1; + + asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); + *value = EAX_EDX_VAL(val, low, high); + return 0; +} +#else +#error("rdpmc() is not defined"); +#endif + //---------------------------------------------------------------------- // Public Interfaces //---------------------------------------------------------------------- +// read the counter value of the event +int read_event_counter(event_thread_t *current,uint64_t *val){ + pe_mmap_t *current_perf_mmap = current->mmap; + assert(rdpmc(current_perf_mmap, val)); + return 0; +} //---------------------------------------------------------- // reading mmap buffer from the kernel diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h index 057cc36012..94fb073e47 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h @@ -83,6 +83,6 @@ pe_mmap_t* set_mmap(int perf_fd); void perf_unmmap(pe_mmap_t *mmap); int read_perf_buffer(event_thread_t *current, perf_mmap_data_t *mmap_info); - +int read_event_counter(event_thread_t *current,uint64_t *val); #endif From 7b930bb97f5f2e3113951e9ce8723a5ff22c48ff Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Mon, 18 Dec 2017 10:50:23 -0500 Subject: [PATCH 05/43] Added the mechanism to read the perf counter --- .../hpcrun/sample-sources/perf/linux_perf.c | 12 +++++- .../hpcrun/sample-sources/perf/perf-util.c | 6 +++ .../hpcrun/sample-sources/perf/perf_mmap.c | 39 ++++++++++++++++++- .../sample-sources/watchpoint_clients.c | 30 
+++++++++----- 4 files changed, 75 insertions(+), 12 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 53d32425d4..fddbecbc91 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -207,6 +207,7 @@ static const char *event_name = "CPU_CYCLES"; //****************************************************************************** int reuse_cacheline_distance_event_index = -1; +int linux_perf_sample_source_index = -1; //****************************************************************************** // local variables @@ -765,7 +766,7 @@ METHOD_FN(process_event_list, int lush_metrics) // automatically. But in practice, it didn't. Not sure why. for (event = start_tok(evlist); more_tok(); event = next_tok(), num_events++); - + self->evl.nevents = num_events; // setup all requested events @@ -796,7 +797,7 @@ METHOD_FN(process_event_list, int lush_metrics) int period_type = hpcrun_extract_ev_thresh(event, sizeof(name), name, &threshold, default_threshold.threshold_num); - //printf("period_type %d\n", period_type);printf("threshold %ld, %ld\n", threshold, default_threshold.threshold_num); //jqswang + // ------------------------------------------------------------ // need a special case if we have our own customized predefined event // This "customized" event will use one or more perf events @@ -837,6 +838,13 @@ METHOD_FN(process_event_list, int lush_metrics) // since the OS will free it, we don't have to do it in hpcrun // set the metric for this perf event event_desc[i].metric = hpcrun_new_metric(); + + /******** For witch client WP_REUSE ***************/ + if (threshold == 0){ + reuse_cacheline_distance_event_index = i; + linux_perf_sample_source_index = self->sel_idx; + } + /**************************************************/ // ------------------------------------------------------------ // if we use frequency (event_type=1) then 
the period is not deterministic, diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.c b/src/tool/hpcrun/sample-sources/perf/perf-util.c index 23ec629a7e..d49aada210 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.c +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.c @@ -378,6 +378,12 @@ perf_attr_init( attr->freq = (usePeriod ? 0 : 1); attr->sample_period = threshold; /* Period or frequency of sampling */ + + //jqswang + if (threshold == 0){ + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING; + } + int max_sample_rate = perf_max_sample_rate(); if (attr->freq == 1 && threshold > max_sample_rate) { diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index f5cb88457f..ea38d3da1b 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -426,6 +426,7 @@ parse_buffer(int sample_type, event_thread_t *current, perf_mmap_data_t *mmap_in static inline int rdpmc(pe_mmap_t *mmap, uint64_t *value) { int counter = mmap->index - 1; + //fprintf(stderr,"counter = %d\n", counter); DECLARE_ARGS(val, low, high); if (counter < 0) return -1; @@ -438,6 +439,27 @@ static inline int rdpmc(pe_mmap_t *mmap, uint64_t *value) #error("rdpmc() is not defined"); #endif +/* + * values[0] = raw count + * values[1] = TIME_ENABLED + * values[2] = TIME_RUNNING + */ +static inline uint64_t perf_scale(uint64_t *values) { + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + fprintf(stderr, "WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + + //---------------------------------------------------------------------- // Public Interfaces 
//---------------------------------------------------------------------- @@ -445,7 +467,22 @@ static inline int rdpmc(pe_mmap_t *mmap, uint64_t *value) // read the counter value of the event int read_event_counter(event_thread_t *current,uint64_t *val){ pe_mmap_t *current_perf_mmap = current->mmap; - assert(rdpmc(current_perf_mmap, val)); + //rdpmc(current_perf_mmap, val); //something wrong when using rdpmc + + uint64_t values[3]; + if (current->fd < 0){ + EMSG("Error: unable to open the event %d file descriptor", current->event->id); + return -1; + } + int ret = read(current->fd, values, sizeof(values)); + if (ret < sizeof(values)) { + EMSG("Error: unable to read event %d", current->event->id); + return -1; + } + + *val = perf_scale(values); + + //fprintf(stderr, "val = %lx\n", *val); return 0; } diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 0c5aa895a6..fbfbd11211 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -158,6 +158,9 @@ int true_ww_metric_id = -1; int true_rw_metric_id = -1; int true_wr_metric_id = -1; +extern int reuse_cacheline_distance_event_index; +extern int linux_perf_sample_source_index; + #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; @@ -1287,8 +1290,16 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; uint64_t time_distance = rdtsc() - wpi->startTime; - uint64_t cacheline_distance; // readcounter - wpi->sample.cacheMissCount //TODO:jqswang - + uint64_t cacheline_distance; + event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; + read_event_counter(&(event_thread[reuse_cacheline_distance_event_index]), &cacheline_distance); + if (cacheline_distance < wpi->sample.cachelineReuseDistance){ + fprintf(stderr, "HPCRUN: cacheline counter value decreased, previous %lx --> current %lx\n", wpi->sample.cachelineReuseDistance, cacheline_distance); + cacheline_distance = 0; // maybe set it to zero ?? + } + else { + cacheline_distance -= wpi->sample.cachelineReuseDistance; + } //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; @@ -2312,13 +2323,14 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, goto ErrExit; // incorrect access type } //make sure the following variables have been set - //assert(cache_miss_event_set >= 0); - //assert(cache_miss_event_seq >= 0); - - // Read the cache miss counter - long long cacheMissCount; - //TODO: jqswang assert(ReadEventCounter(cache_miss_event_set /* for PAPI */, cache_miss_event_seq, &cacheMissCount) == PAPI_OK); - sd.cachelineReuseDistance = cacheMissCount; + assert(linux_perf_sample_source_index >= 0); + assert(reuse_cacheline_distance_event_index >= 0); + + // Read the cacheline event counter + uint64_t cachelineCount; + event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; + read_event_counter(&(event_thread[reuse_cacheline_distance_event_index]), &cachelineCount); + sd.cachelineReuseDistance = cachelineCount; // register the watchpoint SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } From e386bc8584939480cad1beac5177fe5c49b1f9df Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 20 Dec 2017 
11:46:07 -0500 Subject: [PATCH 06/43] When reading the counter, currently disabled the scaling. --- src/tool/hpcrun/sample-sources/perf/perf_mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index ea38d3da1b..d86e78a926 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -480,7 +480,11 @@ int read_event_counter(event_thread_t *current,uint64_t *val){ return -1; } - *val = perf_scale(values); + *val = values[0]; //*val = perf_scale(values); + //TODO: If recording cache misses, the scaled value may be smaller than the previous reading (the counter should always increment if not reset). + // While time_enabled is always increasing, the counter of value and time_runing always increase together. + // We need to check whether we need to scale the value. If yes, how? + //fprintf(stderr, "values[0] = %lx, values[1] = %lx, values[2] = %lx\n", values[0],values[1], values[2); //fprintf(stderr, "val = %lx\n", *val); return 0; From 478d5d5b3adaf433e235a91b7fa79b414098ab0d Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 30 Dec 2017 09:49:58 -0500 Subject: [PATCH 07/43] For the context of triggered point, use the precise PC instead of the context PC to construct the calling context --- .../sample-sources/watchpoint_clients.c | 38 +++++++++++++++---- .../sample-sources/watchpoint_support.c | 4 +- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index fbfbd11211..8c82d47fcd 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -1065,14 +1065,36 @@ static inline void UpdateConcatenatedPathPair(void *ctxt, cct_node_t * oldNode, // update the foundMetric cct_metric_data_increment(metricId, node, 
(cct_metric_data_t){.i = metricInc}); } -static inline void UpdateConcatenatedPathPairMultiple(void *ctxt, cct_node_t * oldNode, const void * joinNode, int *metricIdArray, uint64_t *metricIncArray, uint32_t numMetric){ +static inline void UpdateConcatenatedPathPairMultiple(void *ctxt, void *precise_pc, cct_node_t * oldNode, const void * joinNode, int *metricIdArray, uint64_t *metricIncArray, uint32_t numMetric){ + // Currently, we assume precise_pc + 1 = context_pc (+1 means one instruction) if (numMetric == 0) return; + // unwind call stack once - sample_val_t v = hpcrun_sample_callpath(ctxt, metricIdArray[0], SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + sample_val_t v = hpcrun_sample_callpath(ctxt, metricIdArray[0], SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *new_node = v.sample_node; + if (precise_pc !=0){ + cct_node_t *tmp_node = hpcrun_cct_parent(new_node); + assert(tmp_node); + if (is_same_function(hpcrun_context_pc(ctxt), precise_pc) == SAME_FN){ + tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); // in hpcrun_insert_special_node(), the ip is added by 1. We want to cancel it here. + new_node = tmp_node; + cct_addr_t *addr = hpcrun_cct_addr(tmp_node); + } + else { // if they are not within the same function. Set the node to the calling site. 
+ cct_addr_t * addr = hpcrun_cct_addr(tmp_node); + if (addr->ip_norm.lm_ip - (unsigned long)precise_pc <= 15){ + tmp_node = hpcrun_cct_parent(tmp_node); + assert(tmp_node); + tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); + new_node = tmp_node; + } + } + } + // insert a special node cct_node_t *node = hpcrun_insert_special_node(oldNode, joinNode); // concatenate call paths - node = hpcrun_cct_insert_path_return_leaf(v.sample_node, node); + node = hpcrun_cct_insert_path_return_leaf(new_node, node); for(uint32_t i = 0; i < numMetric; i++){ // update the foundMetric cct_metric_data_increment(metricIdArray[i], node, (cct_metric_data_t){.i = metricIncArray[i]}); @@ -1281,7 +1303,8 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ if(!wt->pc) { // if the ip is 0, let's retain the WP - return RETAIN_WP; + //return RETAIN_WP; + return ALREADY_DISABLED; } // Report a reuse double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); @@ -1312,12 +1335,12 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse if (wpi->sample.reuseType == REUSE_TEMPORAL){ reuseTemporal += inc; metricIdArray[0] = temporal_reuse_metric_id; - UpdateConcatenatedPathPairMultiple(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_TEPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + UpdateConcatenatedPathPairMultiple(wt->ctxt,wt->pc, wpi->sample.node /* oldNode*/, joinNodes[E_TEPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } else { reuseSpatial += inc; metricIdArray[0] = spatial_reuse_metric_id; - UpdateConcatenatedPathPairMultiple(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + UpdateConcatenatedPathPairMultiple(wt->ctxt, 
wt->pc, wpi->sample.node /* oldNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } return ALREADY_DISABLED; } @@ -2286,7 +2309,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .sampledMetricId=sampledMetricId, .isSamplePointAccurate = isSamplePointAccurate, .preWPAction=theWPConfig->preWPAction, - .isBackTrace = false + .isBackTrace = false, }; sd.wpLength = GetFloorWPLength(accessLen); if (rdtsc() & 1) { // 50% chance to detect spatial reuse @@ -2326,6 +2349,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, assert(linux_perf_sample_source_index >= 0); assert(reuse_cacheline_distance_event_index >= 0); + // Read the cacheline event counter uint64_t cachelineCount; event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index 2aace57f02..9a73e40305 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -833,7 +833,6 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig EMSG("Failed to ReadMampBuffer: %s\n", strerror(errno)); monitor_real_abort(); } - if(! (hdr.misc & PERF_RECORD_MISC_EXACT_IP)){ //EMSG("PERF_SAMPLE_IP imprecise\n"); tData.numWatchpointImpreciseIP ++; @@ -847,7 +846,8 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig reliableIP = patchedIP; } else { // Fake as requested by Xu for reuse clients - reliableIP = contextIP-1; + //reliableIP = contextIP-1; + reliableIP = 0; //To avoid ambiguity, just ignore the inaccurate PC for the moment for reuse client. TODO: Can we still use the inaccurate pc? 
} //EMSG("PERF_SAMPLE_IP imprecise: %p patched to %p in WP handler\n", tmpIP, patchedIP); } else { From f02834e75fc8bd01404ef36cff9ddd292c9f4ad6 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 30 Dec 2017 15:31:37 -0500 Subject: [PATCH 08/43] Fixed a typo --- src/tool/hpcrun/sample-sources/watchpoint_clients.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 8c82d47fcd..87aa637be5 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -914,7 +914,7 @@ enum JoinNodeType { E_KILLED=0, E_USED, E_NEW_VAL, - E_TEPORALLY_REUSED, + E_TEMPORALLY_REUSED, E_SPATIALLY_REUSED, E_TRUE_WW_SHARE, E_TRUE_WR_SHARE, @@ -945,8 +945,8 @@ static void USED_BY_INACCURATE_PC(void) {} static void NEW_VAL_BY(void) {} static void NEW_VAL_BY_INACCURATE_PC(void) {} -static void TEPORALLY_REUSED_BY(void) {} -static void TEPORALLY_REUSED_BY_INACCURATE_PC(void) {} +static void TEMPORALLY_REUSED_BY(void) {} +static void TEMPORALLY_REUSED_BY_INACCURATE_PC(void) {} static void SPATIALLY_REUSED_BY(void) {} static void SPATIALLY_REUSED_BY_INACCURATE_PC(void) {} @@ -994,7 +994,7 @@ static const void * joinNodes[][2] = { [E_KILLED] = GET_FUN_ADDR(KILLED_BY), [E_USED] = GET_FUN_ADDR(USED_BY), [E_NEW_VAL] = GET_FUN_ADDR(NEW_VAL_BY), - [E_TEPORALLY_REUSED] = GET_FUN_ADDR(TEPORALLY_REUSED_BY), + [E_TEMPORALLY_REUSED] = GET_FUN_ADDR(TEMPORALLY_REUSED_BY), [E_SPATIALLY_REUSED] = GET_FUN_ADDR(SPATIALLY_REUSED_BY), [E_TRUE_WW_SHARE] = GET_FUN_ADDR(TRUE_WW_SHARE), [E_TRUE_WR_SHARE] = GET_FUN_ADDR(TRUE_WR_SHARE), @@ -1335,7 +1335,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse if (wpi->sample.reuseType == REUSE_TEMPORAL){ reuseTemporal += inc; metricIdArray[0] = temporal_reuse_metric_id; - UpdateConcatenatedPathPairMultiple(wt->ctxt,wt->pc, wpi->sample.node /* 
oldNode*/, joinNodes[E_TEPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + UpdateConcatenatedPathPairMultiple(wt->ctxt,wt->pc, wpi->sample.node /* oldNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } else { reuseSpatial += inc; From 992e6fd961b13e55077d1c12e632d0fc88e1c2e5 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Tue, 9 Jan 2018 16:33:10 -0500 Subject: [PATCH 09/43] For reuse clients, the calling context becomes [root] reuse_point + joinNode + first_use_point [leaf] --- .../sample-sources/watchpoint_clients.c | 68 +++++++++++-------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 4dc21528bc..b0c38d3f1b 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -943,11 +943,11 @@ static void USED_BY_INACCURATE_PC(void) {} static void NEW_VAL_BY(void) {} static void NEW_VAL_BY_INACCURATE_PC(void) {} -static void TEMPORALLY_REUSED_BY(void) {} -static void TEMPORALLY_REUSED_BY_INACCURATE_PC(void) {} +static void TEMPORALLY_REUSED_FROM(void) {} +static void TEMPORALLY_REUSED_FROM_INACCURATE_PC(void) {} -static void SPATIALLY_REUSED_BY(void) {} -static void SPATIALLY_REUSED_BY_INACCURATE_PC(void) {} +static void SPATIALLY_REUSED_FROM(void) {} +static void SPATIALLY_REUSED_FROM_INACCURATE_PC(void) {} static void TRUE_WW_SHARE(void) {} static void TRUE_WW_SHARE_INACCURATE_PC(void) {} @@ -992,8 +992,8 @@ static const void * joinNodes[][2] = { [E_KILLED] = GET_FUN_ADDR(KILLED_BY), [E_USED] = GET_FUN_ADDR(USED_BY), [E_NEW_VAL] = GET_FUN_ADDR(NEW_VAL_BY), - [E_TEMPORALLY_REUSED] = GET_FUN_ADDR(TEMPORALLY_REUSED_BY), - [E_SPATIALLY_REUSED] = GET_FUN_ADDR(SPATIALLY_REUSED_BY), + [E_TEMPORALLY_REUSED] = GET_FUN_ADDR(TEMPORALLY_REUSED_FROM), + [E_SPATIALLY_REUSED] = 
GET_FUN_ADDR(SPATIALLY_REUSED_FROM), [E_TRUE_WW_SHARE] = GET_FUN_ADDR(TRUE_WW_SHARE), [E_TRUE_WR_SHARE] = GET_FUN_ADDR(TRUE_WR_SHARE), [E_TRUE_RW_SHARE] = GET_FUN_ADDR(TRUE_RW_SHARE), @@ -1063,36 +1063,42 @@ static inline void UpdateConcatenatedPathPair(void *ctxt, cct_node_t * oldNode, // update the foundMetric cct_metric_data_increment(metricId, node, (cct_metric_data_t){.i = metricInc}); } -static inline void UpdateConcatenatedPathPairMultiple(void *ctxt, void *precise_pc, cct_node_t * oldNode, const void * joinNode, int *metricIdArray, uint64_t *metricIncArray, uint32_t numMetric){ - // Currently, we assume precise_pc + 1 = context_pc (+1 means one instruction) - if (numMetric == 0) return; + +static inline cct_node_t *getPreciseNode(void *ctxt, void *precise_pc, int dummyMetricId){ + // currently, we assume precise_pc + 1 = context_pc for PEBS (+1 means one instruction) + // we want the context to point to the exact IP // unwind call stack once - sample_val_t v = hpcrun_sample_callpath(ctxt, metricIdArray[0], SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + sample_val_t v = hpcrun_sample_callpath(ctxt, dummyMetricId, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); cct_node_t *new_node = v.sample_node; - if (precise_pc !=0){ - cct_node_t *tmp_node = hpcrun_cct_parent(new_node); - assert(tmp_node); - if (is_same_function(hpcrun_context_pc(ctxt), precise_pc) == SAME_FN){ - tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); // in hpcrun_insert_special_node(), the ip is added by 1. We want to cancel it here. + if (precise_pc == 0) return new_node; + + cct_node_t *tmp_node = hpcrun_cct_parent(new_node); + assert(tmp_node); + if (is_same_function(hpcrun_context_pc(ctxt), precise_pc) == SAME_FN){ + tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); // in hpcrun_insert_special_node(), the ip is added by 1. We want to cancel it here. 
+ new_node = tmp_node; + cct_addr_t *addr = hpcrun_cct_addr(tmp_node); + } + else { // if they are not within the same function. Set the node to the calling site. + cct_addr_t * addr = hpcrun_cct_addr(tmp_node); + if (addr->ip_norm.lm_ip - (unsigned long)precise_pc <= 15){ + tmp_node = hpcrun_cct_parent(tmp_node); + assert(tmp_node); + tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); new_node = tmp_node; - cct_addr_t *addr = hpcrun_cct_addr(tmp_node); - } - else { // if they are not within the same function. Set the node to the calling site. - cct_addr_t * addr = hpcrun_cct_addr(tmp_node); - if (addr->ip_norm.lm_ip - (unsigned long)precise_pc <= 15){ - tmp_node = hpcrun_cct_parent(tmp_node); - assert(tmp_node); - tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); - new_node = tmp_node; - } } } + return new_node; +} + +static inline void UpdateConcatenatedPathPairMultiple(cct_node_t *bottomNode, cct_node_t * topNode, const void * joinNode, int *metricIdArray, uint64_t *metricIncArray, uint32_t numMetric){ + if (numMetric == 0) return; // insert a special node - cct_node_t *node = hpcrun_insert_special_node(oldNode, joinNode); + cct_node_t *node = hpcrun_insert_special_node(topNode, joinNode); // concatenate call paths - node = hpcrun_cct_insert_path_return_leaf(new_node, node); + node = hpcrun_cct_insert_path_return_leaf(bottomNode, node); for(uint32_t i = 0; i < numMetric; i++){ // update the foundMetric cct_metric_data_increment(metricIdArray[i], node, (cct_metric_data_t){.i = metricIncArray[i]}); @@ -1300,7 +1306,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ if(!wt->pc) { - // if the ip is 0, let's retain the WP + // if the ip is 0, let's drop the WP //return RETAIN_WP; return ALREADY_DISABLED; } @@ -1333,12 +1339,14 @@ static WPTriggerActionType 
ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse if (wpi->sample.reuseType == REUSE_TEMPORAL){ reuseTemporal += inc; metricIdArray[0] = temporal_reuse_metric_id; - UpdateConcatenatedPathPairMultiple(wt->ctxt,wt->pc, wpi->sample.node /* oldNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); + UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } else { reuseSpatial += inc; metricIdArray[0] = spatial_reuse_metric_id; - UpdateConcatenatedPathPairMultiple(wt->ctxt, wt->pc, wpi->sample.node /* oldNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); + cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, spatial_reuse_metric_id ); + UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } return ALREADY_DISABLED; } From 82dc2c5dab095b5e0c414ab8a5e95a0155b80156 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 10 Jan 2018 11:55:26 -0500 Subject: [PATCH 10/43] Added weight field for PERF --- src/tool/hpcrun/sample-sources/perf/linux_perf.c | 2 +- src/tool/hpcrun/sample-sources/perf/perf-util.c | 3 ++- src/tool/hpcrun/sample-sources/perf/perf_mmap.c | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 1978d8dcd9..d676b6255f 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -474,7 +474,7 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } hpcrun_clear_handling_sample(td); #endif - + if(WatchpointClientActive()){ 
OnSample(mmap_data, hpcrun_context_pc(context), diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.c b/src/tool/hpcrun/sample-sources/perf/perf-util.c index 6e1ed8ef88..2b1dae89c9 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.c +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.c @@ -309,7 +309,8 @@ perf_attr_init( unsigned int sample_type = sampletype | PERF_SAMPLE_PERIOD | PERF_SAMPLE_TIME | PERF_SAMPLE_IP | PERF_SAMPLE_ADDR - | PERF_SAMPLE_CPU | PERF_SAMPLE_TID; + | PERF_SAMPLE_CPU | PERF_SAMPLE_TID + | PERF_SAMPLE_WEIGHT; attr->size = sizeof(struct perf_event_attr); /* Size of attribute structure */ attr->freq = (usePeriod ? 0 : 1); diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index 0621e235dd..141d4db075 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -385,6 +385,7 @@ parse_buffer(int sample_type, event_thread_t *current, perf_mmap_data_t *mmap_in #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) if (sample_type & PERF_SAMPLE_WEIGHT) { + perf_read_u64(current_perf_mmap, &mmap_info->weight); data_read++; } if (sample_type & PERF_SAMPLE_DATA_SRC) { From 204ab13cc25be3d7f563740c7d02c76f84903106 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 10 Jan 2018 21:12:06 -0500 Subject: [PATCH 11/43] Added the metric LATENCY for reuse client Re-added the flag PERF_EVENT_IOC_UPDATE_BREAKPOINT for older kernel --- src/tool/hpcrun/sample-sources/watchpoint_clients.c | 7 ++++++- src/tool/hpcrun/sample-sources/watchpoint_support.c | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index b0c38d3f1b..fcc42397a1 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -816,6 +816,8 @@ METHOD_FN(process_event_list, int 
lush_metrics) hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); spatial_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(spatial_reuse_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + latency_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_metric_id, "LATENCY", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_time_distance_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_cacheline_distance_metric_id = hpcrun_new_metric(); @@ -2307,7 +2309,10 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, case WP_REUSE:{ long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + if (accessType == LOAD) { + cct_metric_data_increment(latency_metric_id, node, (cct_metric_data_t){.i = mmap_data->weight}); + } + SampleData_t sd= { .node = node, .type=WP_RW, diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index aae2506f4e..a94ee469e6 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -209,7 +209,7 @@ static void InitConfig(){ CHECK(close(fd)); -#if defined(PERF_EVENT_IOC_MODIFY_ATTRIBUTES) +#if defined(PERF_EVENT_IOC_MODIFY_ATTRIBUTES) || defined(PERF_EVENT_IOC_UPDATE_BREAKPOINT) wpConfig.isWPModifyEnabled = true; #else wpConfig.isWPModifyEnabled = false; @@ -409,13 +409,17 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, default: pe.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; } -#if defined(PERF_EVENT_IOC_MODIFY_ATTRIBUTES) +#if defined(PERF_EVENT_IOC_MODIFY_ATTRIBUTES) || defined(PERF_EVENT_IOC_UPDATE_BREAKPOINT) if(modify) { // modification 
assert(wpi->fileHandle != -1); assert(wpi->mmapBuffer != 0); //DisableWatchpoint(wpi); +#if defined(PERF_EVENT_IOC_MODIFY_ATTRIBUTES) CHECK(ioctl(wpi->fileHandle, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, (unsigned long) (&pe))); +#else + CHECK(ioctl(wpi->fileHandle, PERF_EVENT_IOC_UPDATE_BREAKPOINT, (unsigned long) (&pe))); +#endif //if(wpi->isActive == false) { //EnableWatchpoint(wpi->fileHandle); //} From f4fac972ba5ef849231030e04f28e0185ba66d00 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Thu, 11 Jan 2018 16:54:36 -0500 Subject: [PATCH 12/43] Fixed a bug --- src/tool/hpcrun/sample-sources/watchpoint_clients.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index fcc42397a1..fe56df1d4d 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -2309,7 +2309,8 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, case WP_REUSE:{ long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - if (accessType == LOAD) { + const char *event_name = hpcrun_id2metric(sampledMetricId)->name; + if ( strstr(event_name, "LATENCY_ABOVE_THRESHOLD") || strstr(event_name, "LOAD_LATENCY") ) { cct_metric_data_increment(latency_metric_id, node, (cct_metric_data_t){.i = mmap_data->weight}); } From 48b2d70a856a50ecb34ebcbf12d4a2573f6c81d8 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 12 Jan 2018 00:51:57 -0500 Subject: [PATCH 13/43] Set the correct period of the metric "LATENCY" --- src/tool/hpcrun/sample-sources/watchpoint_clients.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index fe56df1d4d..068ae90fa6 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ 
b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -528,6 +528,14 @@ static void ClientTermination(){ hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseSpatial_inc(reuseSpatial); + // Set the period of LATENCY and event MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD/LOAD_LATENCY the same + // NOTES: There exists a concurrency problem. But will it cause any problem? + for(int i=0; i < hpcrun_get_num_metrics(); i++){ + if (strstr(hpcrun_id2metric(i)->name,"MEM_TRANS_RETIRED")){ + hpcrun_id2metric(latency_metric_id)->period = hpcrun_id2metric(i)->period; + break; + } + } break; case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: From cc725a82e08232f06b8033e0a1a67eee46a0321f Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 12 Jan 2018 16:21:30 -0500 Subject: [PATCH 14/43] Minors --- src/tool/hpcrun/sample-sources/perf/linux_perf.c | 7 +++++++ src/tool/hpcrun/sample-sources/watchpoint_clients.c | 12 +----------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index d676b6255f..11e11fe881 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -813,6 +813,13 @@ METHOD_FN(process_event_list, int lush_metrics) metric_desc_t *m = hpcrun_set_metric_info_and_period(event_desc[i].metric, name_dup, MetricFlags_ValFmt_Real, threshold, prop); + // add the latency metric if the event is MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD or MEM_TRANS_RETIRED:LOAD_LATENCY + if (strstr(name, "MEM_TRANS_RETIRED")) { + extern int latency_metric_id; + latency_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_metric_id, "LATENCY", MetricFlags_ValFmt_Int, threshold, metric_property_none); + } + if (m == NULL) { EMSG("Error: unable to create metric #%d: %s", index, name); } else { diff --git 
a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 068ae90fa6..f1fbbc9e4f 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -528,14 +528,6 @@ static void ClientTermination(){ hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseSpatial_inc(reuseSpatial); - // Set the period of LATENCY and event MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD/LOAD_LATENCY the same - // NOTES: There exists a concurrency problem. But will it cause any problem? - for(int i=0; i < hpcrun_get_num_metrics(); i++){ - if (strstr(hpcrun_id2metric(i)->name,"MEM_TRANS_RETIRED")){ - hpcrun_id2metric(latency_metric_id)->period = hpcrun_id2metric(i)->period; - break; - } - } break; case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: @@ -824,8 +816,6 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); spatial_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(spatial_reuse_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); - latency_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(latency_metric_id, "LATENCY", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_time_distance_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_cacheline_distance_metric_id = hpcrun_new_metric(); @@ -2320,7 +2310,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, const char *event_name = hpcrun_id2metric(sampledMetricId)->name; if ( strstr(event_name, "LATENCY_ABOVE_THRESHOLD") || strstr(event_name, "LOAD_LATENCY") ) { cct_metric_data_increment(latency_metric_id, node, 
(cct_metric_data_t){.i = mmap_data->weight}); - } + } SampleData_t sd= { .node = node, From 46afb96593f1f90dfdacbace5afecc6753cde263 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 19 Jan 2018 13:26:03 -0500 Subject: [PATCH 15/43] Added data_src for each perf sample; When sampling the load (threshold_above_latency), do not record the sample if it is an L1 hit --- .../hpcrun/sample-sources/perf/linux_perf.c | 23 +++++++++++++++---- .../hpcrun/sample-sources/perf/perf-util.c | 2 +- .../hpcrun/sample-sources/perf/perf-util.h | 13 +++++++++++ .../sample-sources/watchpoint_clients.c | 5 ---- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 11e11fe881..8e74752ed5 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -449,11 +449,24 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } else { td->precise_pc = 0; } - - *sv = hpcrun_sample_callpath(context, current->event->metric, - (hpcrun_metricVal_t) {.r=counter}, - 0/*skipInner*/, 0/*isSync*/, &info); - + + if ( strstr(current->event->metric_desc->name, "LATENCY_ABOVE_THRESHOLD") || strstr(current->event->metric_desc->name, "LOAD_LATENCY") ) { + perf_mmap_data_src_t data_src; + data_src.val = mmap_data->data_src; + if ( (data_src.mem_lvl & PERF_MEM_LVL_HIT) && (data_src.mem_lvl & PERF_MEM_LVL_L1)){ // L1 HIT, ignore + *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t){.i=0}, 0/*skipInner*/, 0/*isSync*/, NULL); + } + else { + *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t) {.r=counter}, 0/*skipInner*/, 0/*isSync*/, &info); + extern int latency_metric_id; + cct_metric_data_increment(latency_metric_id, sv->sample_node, (cct_metric_data_t){.i = mmap_data->weight}); + } + } + else { + *sv = hpcrun_sample_callpath(context, current->event->metric, + 
(hpcrun_metricVal_t) {.r=counter}, + 0/*skipInner*/, 0/*isSync*/, &info); + } // no need to reset the precise_pc; hpcrun_sample_callpath does so // td->precise_pc = 0; diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.c b/src/tool/hpcrun/sample-sources/perf/perf-util.c index 2b1dae89c9..24e8c46a24 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.c +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.c @@ -310,7 +310,7 @@ perf_attr_init( | PERF_SAMPLE_PERIOD | PERF_SAMPLE_TIME | PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | PERF_SAMPLE_CPU | PERF_SAMPLE_TID - | PERF_SAMPLE_WEIGHT; + | PERF_SAMPLE_WEIGHT | PERF_SAMPLE_DATA_SRC; attr->size = sizeof(struct perf_event_attr); /* Size of attribute structure */ attr->freq = (usePeriod ? 0 : 1); diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.h b/src/tool/hpcrun/sample-sources/perf/perf-util.h index 24b77d888e..38db32e702 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.h +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.h @@ -121,6 +121,19 @@ typedef struct perf_mmap_data_s { } perf_mmap_data_t; +// data structure for the data_src field of perf_mmap_data_t +typedef union perf_mmap_data_src_t { + uint64_t val; + struct { + uint64_t mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_rsvd:31; + }; +} perf_mmap_data_src_t; + // -------------------------------------------------------------- // main data structure to store the information of an event. 
diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index f1fbbc9e4f..6dff6f4b63 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -2307,11 +2307,6 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, case WP_REUSE:{ long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - const char *event_name = hpcrun_id2metric(sampledMetricId)->name; - if ( strstr(event_name, "LATENCY_ABOVE_THRESHOLD") || strstr(event_name, "LOAD_LATENCY") ) { - cct_metric_data_increment(latency_metric_id, node, (cct_metric_data_t){.i = mmap_data->weight}); - } - SampleData_t sd= { .node = node, .type=WP_RW, From d882c50db2325c9b303b89fc024d94271424badf Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 3 Feb 2018 15:54:19 -0500 Subject: [PATCH 16/43] To read the counter value, we need to scale the value by perf_scale(). Otherwise, it may make the value of some event appear smaller. 
--- .../hpcrun/sample-sources/perf/linux_perf.c | 28 ++++++++++++--- .../hpcrun/sample-sources/perf/perf_mmap.c | 12 ++++--- .../hpcrun/sample-sources/perf/perf_mmap.h | 2 +- .../sample-sources/watchpoint_clients.c | 34 ++++++++++++++----- 4 files changed, 59 insertions(+), 17 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 8e74752ed5..ded61c10cc 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -202,6 +202,8 @@ static const char *event_name = "CPU_CYCLES"; // global variables //****************************************************************************** +int *linux_perf_reading_events = NULL; +int linux_perf_num_reading_events = -1; int reuse_cacheline_distance_event_index = -1; int linux_perf_sample_source_index = -1; @@ -487,12 +489,20 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } hpcrun_clear_handling_sample(td); #endif + //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; + fprintf(stderr, "COUNTER:"); + for (int i=0; i < linux_perf_num_reading_events; i++){ + uint64_t tmp_counter; + linux_perf_read_event_counter(linux_perf_reading_events[i], &tmp_counter, 1 /* is scaled*/); + fprintf(stderr, " %lu", tmp_counter); + } + fprintf(stderr, "\n"); if(WatchpointClientActive()){ - OnSample(mmap_data, - hpcrun_context_pc(context), - sv->sample_node, - current->event->metric); + //OnSample(mmap_data, + // hpcrun_context_pc(context), + // sv->sample_node, + // current->event->metric); } return sv; } @@ -741,6 +751,8 @@ METHOD_FN(process_event_list, int lush_metrics) size_t size = sizeof(event_info_t) * num_events; event_desc = (event_info_t*) hpcrun_malloc(size); + linux_perf_reading_events = (int *) hpcrun_malloc(sizeof(int) * num_events); + linux_perf_num_reading_events = 0; if (event_desc == NULL) { EMSG("Unable to allocate %d bytes", 
size); return; @@ -809,6 +821,7 @@ METHOD_FN(process_event_list, int lush_metrics) /******** For witch client WP_REUSE ***************/ if (threshold == 0){ + linux_perf_reading_events[linux_perf_num_reading_events++] = i; reuse_cacheline_distance_event_index = i; linux_perf_sample_source_index = self->sel_idx; } @@ -985,6 +998,13 @@ void linux_perf_events_resume(){ perf_start_all(nevents, event_thread); } +int linux_perf_read_event_counter(int event_index, uint64_t *val, int isScaled){ + // isScaled: >=1 -> True, <=0 ->False + sample_source_t *self = &obj_name(); + event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; + return read_event_counter(&(event_thread[event_index]), val, isScaled); +} + // --------------------------------------------- // signal handler // --------------------------------------------- diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index 141d4db075..204feee4c2 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -467,7 +467,8 @@ static inline uint64_t perf_scale(uint64_t *values) { //---------------------------------------------------------------------- // read the counter value of the event -int read_event_counter(event_thread_t *current,uint64_t *val){ +int read_event_counter(event_thread_t *current,uint64_t *val, int isScaled){ + // isScaled: >=1 -> True, <=0 ->False pe_mmap_t *current_perf_mmap = current->mmap; //rdpmc(current_perf_mmap, val); //something wrong when using rdpmc @@ -481,8 +482,12 @@ int read_event_counter(event_thread_t *current,uint64_t *val){ EMSG("Error: unable to read event %d", current->event->id); return -1; } - - *val = values[0]; //*val = perf_scale(values); + if (isScaled >= 1){ + *val = perf_scale(values); + } + else { + *val = values[0]; + } //TODO: If recording cache misses, the scaled value may be smaller than the previous reading (the counter should always increment 
if not reset). // While time_enabled is always increasing, the counter of value and time_runing always increase together. // We need to check whether we need to scale the value. If yes, how? @@ -491,7 +496,6 @@ int read_event_counter(event_thread_t *current,uint64_t *val){ //fprintf(stderr, "val = %lx\n", *val); return 0; } - //---------------------------------------------------------- // reading mmap buffer from the kernel // in/out: mmapped data of type perf_mmap_data_t. diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h index 94fb073e47..7f23bc6974 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h @@ -83,6 +83,6 @@ pe_mmap_t* set_mmap(int perf_fd); void perf_unmmap(pe_mmap_t *mmap); int read_perf_buffer(event_thread_t *current, perf_mmap_data_t *mmap_info); -int read_event_counter(event_thread_t *current,uint64_t *val); +int read_event_counter(event_thread_t *current,uint64_t *val, int isScaled); #endif diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 6dff6f4b63..902194f1d6 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -158,7 +158,8 @@ int true_wr_metric_id = -1; extern int reuse_cacheline_distance_event_index; extern int linux_perf_sample_source_index; - +extern int *linux_perf_reading_events; +extern int linux_perf_num_reading_events; #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; @@ -1317,9 +1318,17 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; uint64_t time_distance = rdtsc() - wpi->startTime; - uint64_t cacheline_distance; - event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; - read_event_counter(&(event_thread[reuse_cacheline_distance_event_index]), &cacheline_distance); + uint64_t cacheline_distance = 0; + //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; + //fprintf(stderr, "SECOND_COUNTER: 0x%lx,%lu", wt->pc, rdtsc()); + for (int i=0; i < linux_perf_num_reading_events; i++){ + uint64_t tmp_counter; + //read_event_counter(&(event_thread[ linux_perf_reading_events[i]]), &tmp_counter); + linux_perf_read_event_counter( linux_perf_reading_events[i], &tmp_counter); + //fprintf(stderr, " %lu", tmp_counter); + cacheline_distance += tmp_counter; + } + //fprintf(stderr, "\n"); if (cacheline_distance < wpi->sample.cachelineReuseDistance){ fprintf(stderr, "HPCRUN: cacheline counter value decreased, previous %lx --> current %lx\n", wpi->sample.cachelineReuseDistance, cacheline_distance); cacheline_distance = 0; // maybe set it to zero ?? 
@@ -1327,6 +1336,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse else { cacheline_distance -= wpi->sample.cachelineReuseDistance; } + fprintf(stderr, "REUSE_DISTANCE: %lu\n", cacheline_distance); //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; @@ -2359,11 +2369,19 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, // Read the cacheline event counter uint64_t cachelineCount; - event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; - read_event_counter(&(event_thread[reuse_cacheline_distance_event_index]), &cachelineCount); - sd.cachelineReuseDistance = cachelineCount; + //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; + sd.cachelineReuseDistance = 0; + fprintf(stderr, "FIRST_COUNTER: 0x%lx,%lu", precisePC, rdtsc()); + for (int i=0; i < linux_perf_num_reading_events; i++){ + //read_event_counter(&(event_thread[ linux_perf_reading_events[i]]), &cachelineCount); + linux_perf_read_event_counter(linux_perf_reading_events[i], &cachelineCount); + sd.cachelineReuseDistance += cachelineCount; + fprintf(stderr, " %lu", cachelineCount); + } + fprintf(stderr,"\n"); + // register the watchpoint - SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); + //SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; case WP_FALSE_SHARING: From 2cd656817370f852761957d133f95e7bdb94f028 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Tue, 6 Feb 2018 11:26:06 -0500 Subject: [PATCH 17/43] DEBUGGING MODE: Testing the reuse distance accuracy by "predicting the counting rate" --- .../hpcrun/sample-sources/perf/linux_perf.c | 56 ++++++++++-- .../hpcrun/sample-sources/perf/perf_mmap.c | 76 ++++++++++------- .../hpcrun/sample-sources/perf/perf_mmap.h | 5 +- .../sample-sources/watchpoint_clients.c | 85 ++++++++++++++++--- .../sample-sources/watchpoint_support.h | 
2 +- 5 files changed, 170 insertions(+), 54 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index ded61c10cc..00e97406cc 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -189,6 +189,21 @@ static int perf_event_handler( int sig, siginfo_t* siginfo, void* context); +static inline uint64_t perf_scale(uint64_t *values) { //jqswang + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + fprintf(stderr, "WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + //****************************************************************************** // constants //****************************************************************************** @@ -489,7 +504,7 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } hpcrun_clear_handling_sample(td); #endif - //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; +#if 0 //for debugging fprintf(stderr, "COUNTER:"); for (int i=0; i < linux_perf_num_reading_events; i++){ uint64_t tmp_counter; @@ -497,12 +512,34 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, fprintf(stderr, " %lu", tmp_counter); } fprintf(stderr, "\n"); - +#endif if(WatchpointClientActive()){ - //OnSample(mmap_data, - // hpcrun_context_pc(context), - // sv->sample_node, - // current->event->metric); + OnSample(mmap_data, + hpcrun_context_pc(context), + sv->sample_node, + current->event->metric); + } + else { +#if 0 + fprintf(stderr, "COUNTER:"); + for (int i=0; i < linux_perf_num_reading_events; i++){ + uint64_t tmp_counter; + linux_perf_read_event_counter(linux_perf_reading_events[i], &tmp_counter, 1 /* is 
scaled*/); + fprintf(stderr, " %lu", tmp_counter); + } + fprintf(stderr, "\n"); + + fprintf(stderr, "COUNTER:"); + for (int i=0; i < linux_perf_num_reading_events; i++){ + uint64_t val[3]; + uint64_t scaled; + linux_perf_read_event_counter_full( linux_perf_reading_events[i], val); + scaled = perf_scale(val); + fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); + } + fprintf(stderr, "\n"); +#endif + } return sv; } @@ -998,13 +1035,14 @@ void linux_perf_events_resume(){ perf_start_all(nevents, event_thread); } -int linux_perf_read_event_counter(int event_index, uint64_t *val, int isScaled){ - // isScaled: >=1 -> True, <=0 ->False +// val is a uint64_t array and has at least 3 elements +int linux_perf_read_event_counter(int event_index, uint64_t *val){ sample_source_t *self = &obj_name(); event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; - return read_event_counter(&(event_thread[event_index]), val, isScaled); + return perf_read_event_counter(&(event_thread[event_index]), val); } + // --------------------------------------------- // signal handler // --------------------------------------------- diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index 204feee4c2..ef04cbc7ca 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -62,7 +62,6 @@ #include #include - /****************************************************************************** * hpcrun includes *****************************************************************************/ @@ -441,24 +440,22 @@ static inline int rdpmc(pe_mmap_t *mmap, uint64_t *value) #error("rdpmc() is not defined"); #endif + /* - * values[0] = raw count - * values[1] = TIME_ENABLED - * values[2] = TIME_RUNNING + * val[0] = raw count + * val[1] = TIME_ENABLED + * val[2] = TIME_RUNNING */ -static inline uint64_t perf_scale(uint64_t *values) { - uint64_t res = 0; - - if (!values[2] && 
!values[1] && values[0]) { +static inline int isCounterValid(uint64_t *val){ + if (!val[2] && !val[1] && val[0]) { fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + return -1; } - if (values[2] > values[1]) { + if (val[2] > val[1]) { fprintf(stderr, "WARNING: time_running > time_enabled\n"); + return -1; } - if (values[2]) { - res = (uint64_t)((double)values[0] * values[1]/values[2]); - } - return res; + return 1; } @@ -467,35 +464,54 @@ static inline uint64_t perf_scale(uint64_t *values) { //---------------------------------------------------------------------- // read the counter value of the event -int read_event_counter(event_thread_t *current,uint64_t *val, int isScaled){ - // isScaled: >=1 -> True, <=0 ->False +// val is an array of uint64_t, at least has a length of 3 +int perf_read_event_counter(event_thread_t *current, uint64_t *val){ + pe_mmap_t *current_perf_mmap = current->mmap; //rdpmc(current_perf_mmap, val); //something wrong when using rdpmc - uint64_t values[3]; if (current->fd < 0){ EMSG("Error: unable to open the event %d file descriptor", current->event->id); return -1; } - int ret = read(current->fd, values, sizeof(values)); - if (ret < sizeof(values)) { + int ret = read(current->fd, val, sizeof(uint64_t) * 3 ); + if (ret < sizeof(uint64_t)*3) { EMSG("Error: unable to read event %d", current->event->id); return -1; } - if (isScaled >= 1){ - *val = perf_scale(values); - } - else { - *val = values[0]; - } - //TODO: If recording cache misses, the scaled value may be smaller than the previous reading (the counter should always increment if not reset). - // While time_enabled is always increasing, the counter of value and time_runing always increase together. - // We need to check whether we need to scale the value. If yes, how? 
- //fprintf(stderr, "values[0] = %lx, values[1] = %lx, values[2] = %lx\n", values[0],values[1], values[2); - - //fprintf(stderr, "val = %lx\n", *val); return 0; } + +/* + * val[0] = raw count + * val[1] = TIME_ENABLED + * val[2] = TIME_RUNNING + */ +uint64_t perf_get_scaled_counter_val(uint64_t *val){ + uint64_t res = 0; + isCounterValid(val); + if (val[2]) { + res = (uint64_t)((double)val[0] * val[1]/val[2] ); + } + return res; +} + + +uint64_t perf_get_scaled_counter_delta(uint64_t *val, uint64_t *prev_val){ + uint64_t res = 0; + isCounterValid(val); + isCounterValid(prev_val); + + if (val[2] - prev_val[2]) { + res = (uint64_t)( + ((double)val[0] - (double)prev_val[0]) * ( (double)val[1] - (double)prev_val[1]) + / ((double) val[2] - (double)prev_val[2]) + ); + } + return res; +} + + //---------------------------------------------------------- // reading mmap buffer from the kernel // in/out: mmapped data of type perf_mmap_data_t. diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h index 7f23bc6974..ede3876555 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h @@ -83,6 +83,7 @@ pe_mmap_t* set_mmap(int perf_fd); void perf_unmmap(pe_mmap_t *mmap); int read_perf_buffer(event_thread_t *current, perf_mmap_data_t *mmap_info); -int read_event_counter(event_thread_t *current,uint64_t *val, int isScaled); - +int perf_read_event_counter(event_thread_t *current, uint64_t *val); +uint64_t perf_get_scaled_counter_val(uint64_t *val); +uint64_t perf_get_scaled_counter_delta(uint64_t *val, uint64_t *prev_val); #endif diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 902194f1d6..76c1857058 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -161,6 +161,35 @@ extern int linux_perf_sample_source_index; extern int 
*linux_perf_reading_events; extern int linux_perf_num_reading_events; +static inline uint64_t perf_scale(uint64_t *values) { //jqswang + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + fprintf(stderr, "WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + +uint64_t old_values[3] = {0,0,0}; +double counting_rate = 0.0; +static inline void update_counting_rate(uint64_t *values){ //jqswang + if ( values[2] == old_values[2]){ + fprintf(stderr, "HPCRUN: WARNING: the sampling rate is too high for the multiplexed events\n"); + return; + } + counting_rate = ((double)(values[0] - old_values[0])) / (values[2] - old_values[2]); + memcpy(old_values, values, sizeof(uint64_t)*3); + +} + + + #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; @@ -1319,24 +1348,42 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t time_distance = rdtsc() - wpi->startTime; uint64_t cacheline_distance = 0; - //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; - //fprintf(stderr, "SECOND_COUNTER: 0x%lx,%lu", wt->pc, rdtsc()); +#if 0 + fprintf(stderr, "SECOND_COUNTER: 0x%lx,%lu", wt->pc, rdtsc()); for (int i=0; i < linux_perf_num_reading_events; i++){ uint64_t tmp_counter; - //read_event_counter(&(event_thread[ linux_perf_reading_events[i]]), &tmp_counter); - linux_perf_read_event_counter( linux_perf_reading_events[i], &tmp_counter); - //fprintf(stderr, " %lu", tmp_counter); + linux_perf_read_event_counter( linux_perf_reading_events[i], &tmp_counter, 0); + fprintf(stderr, " %lu", tmp_counter); cacheline_distance += tmp_counter; } - //fprintf(stderr, "\n"); + fprintf(stderr, "\n"); +#else + fprintf(stderr, 
"SECOND_COUNTER:"); + uint64_t val[3]; + uint64_t scaled; + for (int i=0; i < linux_perf_num_reading_events; i++){ + linux_perf_read_event_counter( linux_perf_reading_events[i], val); + scaled = perf_scale(val); + fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); + cacheline_distance = perf_get_scaled_counter_delta(val, wpi->sample.cachelineReuseDistance); + } + fprintf(stderr, "\n"); +#endif +#if 0 if (cacheline_distance < wpi->sample.cachelineReuseDistance){ - fprintf(stderr, "HPCRUN: cacheline counter value decreased, previous %lx --> current %lx\n", wpi->sample.cachelineReuseDistance, cacheline_distance); + fprintf(stderr, "HPCRUN: cacheline counter value decreased, previous %lu --> current %lu\n", wpi->sample.cachelineReuseDistance, cacheline_distance); cacheline_distance = 0; // maybe set it to zero ?? } else { cacheline_distance -= wpi->sample.cachelineReuseDistance; } - fprintf(stderr, "REUSE_DISTANCE: %lu\n", cacheline_distance); +#endif + if (cacheline_distance == 0){ + fprintf(stderr, "REUSE_DISTANCE (EST): %lu (rate %lf)\n", (uint64_t)( (val[1] - wpi->sample.cachelineReuseDistance[1]) * counting_rate), counting_rate); + } + else{ + fprintf(stderr, "REUSE_DISTANCE (ACC): %lu (rate %lf)\n", cacheline_distance, ((double)val[0] - wpi->sample.cachelineReuseDistance[0] )/((double)val[2] - wpi->sample.cachelineReuseDistance[2])); + } //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; @@ -2370,18 +2417,32 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, // Read the cacheline event counter uint64_t cachelineCount; //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; - sd.cachelineReuseDistance = 0; + //sd.cachelineReuseDistance = 0; +#if 0 fprintf(stderr, "FIRST_COUNTER: 0x%lx,%lu", precisePC, rdtsc()); for (int i=0; i < linux_perf_num_reading_events; i++){ //read_event_counter(&(event_thread[ linux_perf_reading_events[i]]), 
&cachelineCount); - linux_perf_read_event_counter(linux_perf_reading_events[i], &cachelineCount); + linux_perf_read_event_counter(linux_perf_reading_events[i], &cachelineCount, 0); sd.cachelineReuseDistance += cachelineCount; fprintf(stderr, " %lu", cachelineCount); } fprintf(stderr,"\n"); - +#else + fprintf(stderr, "FIRST_COUNTER(rate %lf):", counting_rate); + for (int i=0; i < linux_perf_num_reading_events; i++){ + uint64_t val[3]; + uint64_t scaled; + linux_perf_read_event_counter( linux_perf_reading_events[i], val); + update_counting_rate(val); + scaled = perf_scale(val); + fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); + memcpy(sd.cachelineReuseDistance, val, sizeof(uint64_t)*3);; + } + fprintf(stderr, "\n"); + //sd.cachelineReuseDistance = cachelineCount; +#endif // register the watchpoint - //SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); + SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; case WP_FALSE_SHARING: diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index a289fefb3f..8a15699a6c 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -103,7 +103,7 @@ typedef struct SampleData{ bool isSamplePointAccurate; bool isBackTrace; ReuseType reuseType; - uint64_t cachelineReuseDistance; + uint64_t cachelineReuseDistance[3]; } SampleData_t; typedef struct WatchPointInfo{ From 6778f89b65c621d9034eeb0ddcb63de6664b8e91 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Tue, 6 Feb 2018 13:01:07 -0500 Subject: [PATCH 18/43] DEBUGGING MODE: Only measure the temporal reuse; Check the reuse distance from "estimation" --- src/tool/hpcrun/sample-sources/watchpoint_clients.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 
76c1857058..83976b6c8f 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -1378,12 +1378,17 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse cacheline_distance -= wpi->sample.cachelineReuseDistance; } #endif +#if 1 if (cacheline_distance == 0){ fprintf(stderr, "REUSE_DISTANCE (EST): %lu (rate %lf)\n", (uint64_t)( (val[1] - wpi->sample.cachelineReuseDistance[1]) * counting_rate), counting_rate); + //just drop it } else{ fprintf(stderr, "REUSE_DISTANCE (ACC): %lu (rate %lf)\n", cacheline_distance, ((double)val[0] - wpi->sample.cachelineReuseDistance[0] )/((double)val[2] - wpi->sample.cachelineReuseDistance[2])); } +#endif + //fprintf(stderr, "REUSE_DISTANCE: %lu %lu\n", cacheline_distance, inc); + //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; @@ -2376,7 +2381,8 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .isBackTrace = false, }; sd.wpLength = GetFloorWPLength(accessLen); - if (rdtsc() & 1) { // 50% chance to detect spatial reuse + //if (rdtsc() & 1) { // 50% chance to detect spatial reuse + if (0){ //jqswang: testing, always temporal reuse int wpSizes[] = {8, 4, 2, 1}; FalseSharingLocs falseSharingLocs[CACHE_LINE_SZ]; int numFSLocs = 0; From db21f228c68d846f30bc49b13b2e2367ab4efaa5 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 7 Feb 2018 09:56:14 -0500 Subject: [PATCH 19/43] Minors. 
Still debugging mode --- .../sample-sources/watchpoint_clients.c | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 83976b6c8f..0fadff93fc 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -180,7 +180,7 @@ uint64_t old_values[3] = {0,0,0}; double counting_rate = 0.0; static inline void update_counting_rate(uint64_t *values){ //jqswang if ( values[2] == old_values[2]){ - fprintf(stderr, "HPCRUN: WARNING: the sampling rate is too high for the multiplexed events\n"); + //fprintf(stderr, "HPCRUN: WARNING: the sampling rate is too high for the multiplexed events\n"); return; } counting_rate = ((double)(values[0] - old_values[0])) / (values[2] - old_values[2]); @@ -1348,27 +1348,18 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t time_distance = rdtsc() - wpi->startTime; uint64_t cacheline_distance = 0; -#if 0 - fprintf(stderr, "SECOND_COUNTER: 0x%lx,%lu", wt->pc, rdtsc()); - for (int i=0; i < linux_perf_num_reading_events; i++){ - uint64_t tmp_counter; - linux_perf_read_event_counter( linux_perf_reading_events[i], &tmp_counter, 0); - fprintf(stderr, " %lu", tmp_counter); - cacheline_distance += tmp_counter; - } - fprintf(stderr, "\n"); -#else - fprintf(stderr, "SECOND_COUNTER:"); + +// fprintf(stderr, "SECOND_COUNTER:"); uint64_t val[3]; uint64_t scaled; for (int i=0; i < linux_perf_num_reading_events; i++){ linux_perf_read_event_counter( linux_perf_reading_events[i], val); scaled = perf_scale(val); - fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); +// fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); cacheline_distance = perf_get_scaled_counter_delta(val, wpi->sample.cachelineReuseDistance); } - fprintf(stderr, "\n"); -#endif +// fprintf(stderr, "\n"); + #if 0 if 
(cacheline_distance < wpi->sample.cachelineReuseDistance){ fprintf(stderr, "HPCRUN: cacheline counter value decreased, previous %lu --> current %lu\n", wpi->sample.cachelineReuseDistance, cacheline_distance); @@ -1378,7 +1369,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse cacheline_distance -= wpi->sample.cachelineReuseDistance; } #endif -#if 1 +#if 0 if (cacheline_distance == 0){ fprintf(stderr, "REUSE_DISTANCE (EST): %lu (rate %lf)\n", (uint64_t)( (val[1] - wpi->sample.cachelineReuseDistance[1]) * counting_rate), counting_rate); //just drop it @@ -1389,6 +1380,16 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse #endif //fprintf(stderr, "REUSE_DISTANCE: %lu %lu\n", cacheline_distance, inc); + char marker = 'T'; // T: real data; "F": estimation from counting rate + if (cacheline_distance == 0){ + cacheline_distance = (uint64_t)( (val[1] - wpi->sample.cachelineReuseDistance[1]) * counting_rate); + marker = 'F'; + } + + fprintf(stderr, "REUSE_DISTANCE: %c %lu %lu\n", marker, cacheline_distance, inc); + + + //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; @@ -2434,17 +2435,17 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } fprintf(stderr,"\n"); #else - fprintf(stderr, "FIRST_COUNTER(rate %lf):", counting_rate); + //fprintf(stderr, "FIRST_COUNTER(rate %lf):", counting_rate); for (int i=0; i < linux_perf_num_reading_events; i++){ uint64_t val[3]; uint64_t scaled; linux_perf_read_event_counter( linux_perf_reading_events[i], val); update_counting_rate(val); scaled = perf_scale(val); - fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); + //fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); memcpy(sd.cachelineReuseDistance, val, sizeof(uint64_t)*3);; } - fprintf(stderr, "\n"); + //fprintf(stderr, "\n"); //sd.cachelineReuseDistance = cachelineCount; #endif // register the watchpoint From 
28f475e4467f878b61b05527b05cd4ea06c1f5c6 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 10 Mar 2018 15:26:10 -0500 Subject: [PATCH 20/43] Support sampling both loads and stores --- .../sample-sources/watchpoint_clients.c | 87 +++++++++++++------ .../sample-sources/watchpoint_support.h | 2 +- 2 files changed, 61 insertions(+), 28 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 0fadff93fc..1331180eeb 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -155,6 +155,11 @@ int false_wr_metric_id = -1; int true_ww_metric_id = -1; int true_rw_metric_id = -1; int true_wr_metric_id = -1; +int my_iteration = 0;//jqswang +void increment_iteration(){ + my_iteration++; +} + extern int reuse_cacheline_distance_event_index; extern int linux_perf_sample_source_index; @@ -554,11 +559,21 @@ static void ClientTermination(){ hpcrun_stats_num_oldAppxBytes_inc(oldAppxBytes); break; case WP_REUSE: + { + uint64_t val[3]; + fprintf(stderr, "FINAL_COUNTING:"); + for (int i=0; i < MIN(2,linux_perf_num_reading_events); i++){ + linux_perf_read_event_counter( linux_perf_reading_events[i], val); + //fprintf(stderr,"FINAL_COUNTING: %lu\n" , perf_scale(val) );//jqswang + fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang + } + fprintf(stderr, "\n"); + hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseSpatial_inc(reuseSpatial); - break; + } break; case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: hpcrun_stats_num_accessedIns_inc(accessedIns); @@ -1340,6 +1355,11 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse //return RETAIN_WP; return ALREADY_DISABLED; } + + if( wt->accessType == STORE) //jqswang + return RETAIN_WP; + + // Report a reuse double myProportion = 
ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); @@ -1350,13 +1370,23 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t cacheline_distance = 0; // fprintf(stderr, "SECOND_COUNTER:"); - uint64_t val[3]; - uint64_t scaled; - for (int i=0; i < linux_perf_num_reading_events; i++){ - linux_perf_read_event_counter( linux_perf_reading_events[i], val); - scaled = perf_scale(val); + uint64_t val[2][3]; + //uint64_t scaled; + //for (int i=0; i < linux_perf_num_reading_events; i++){ + for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + linux_perf_read_event_counter( linux_perf_reading_events[i], val[i]); + //scaled = perf_scale(val); // fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); - cacheline_distance = perf_get_scaled_counter_delta(val, wpi->sample.cachelineReuseDistance); + //cacheline_distance = perf_get_scaled_counter_delta(val, wpi->sample.cachelineReuseDistance); + //cacheline_distance = val[1] - wpi->sample.cachelineReuseDistance[1]; +#if 0 + if (val[1] - wpi->sample.cachelineReuseDistance[1] == val[2] - wpi->sample.cachelineReuseDistance[2]){ + cacheline_distance = val[0] - wpi->sample.cachelineReuseDistance[0]; + } + else{ + cacheline_distance = 0; + } +#endif } // fprintf(stderr, "\n"); @@ -1378,17 +1408,9 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse fprintf(stderr, "REUSE_DISTANCE (ACC): %lu (rate %lf)\n", cacheline_distance, ((double)val[0] - wpi->sample.cachelineReuseDistance[0] )/((double)val[2] - wpi->sample.cachelineReuseDistance[2])); } #endif - //fprintf(stderr, "REUSE_DISTANCE: %lu %lu\n", cacheline_distance, inc); - - char marker = 'T'; // T: real data; "F": estimation from counting rate - if (cacheline_distance == 0){ - cacheline_distance = (uint64_t)( (val[1] - wpi->sample.cachelineReuseDistance[1]) * 
counting_rate); - marker = 'F'; - } - - fprintf(stderr, "REUSE_DISTANCE: %c %lu %lu\n", marker, cacheline_distance, inc); - + +// fprintf(stderr, "REUSE_DISTANCE: %c %lu %lu %lu\n", marker, cacheline_distance, inc, val[1] - wpi->sample.cachelineReuseDistance[1]); //prepare the metric updating arrays int metricIdArray[4]; @@ -1399,18 +1421,27 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse metricIdArray[2]=reuse_cacheline_distance_metric_id; metricIncArray[2]=cacheline_distance; metricIdArray[3]=reuse_trapped_metric_id; metricIncArray[3]=1; + cct_node_t *reuseNode; if (wpi->sample.reuseType == REUSE_TEMPORAL){ reuseTemporal += inc; metricIdArray[0] = temporal_reuse_metric_id; - cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); + reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } else { reuseSpatial += inc; metricIdArray[0] = spatial_reuse_metric_id; - cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, spatial_reuse_metric_id ); + reuseNode = getPreciseNode(wt->ctxt, wt->pc, spatial_reuse_metric_id ); UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } + + fprintf(stderr, "REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); + for(int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + fprintf(stderr, " %lu %lu %lu,", val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); + } + fprintf(stderr, "\n"); + + return ALREADY_DISABLED; } @@ -2372,7 +2403,7 @@ bool OnSample(perf_mmap_data_t * 
mmap_data, void * contextPC, cct_node_t *node, accessedIns += metricThreshold; SampleData_t sd= { .node = node, - .type=WP_RW, + .type=WP_RW, //jqswang: Setting it to WP_READ causes segment fault .accessType=accessType, //.wpLength = accessLen, // set later .accessLength= accessLen, @@ -2381,7 +2412,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .preWPAction=theWPConfig->preWPAction, .isBackTrace = false, }; - sd.wpLength = GetFloorWPLength(accessLen); + sd.wpLength = 4;//GetFloorWPLength(accessLen); //jqswang //if (rdtsc() & 1) { // 50% chance to detect spatial reuse if (0){ //jqswang: testing, always temporal reuse int wpSizes[] = {8, 4, 2, 1}; @@ -2410,7 +2441,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, #endif } else { // 50% chance to detect the temporal reuse - sd.va = data_addr; + sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; //data_addr; //jqswang sd.reuseType = REUSE_TEMPORAL; } if (!IsValidAddress(sd.va, precisePC)) { @@ -2436,18 +2467,20 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, fprintf(stderr,"\n"); #else //fprintf(stderr, "FIRST_COUNTER(rate %lf):", counting_rate); - for (int i=0; i < linux_perf_num_reading_events; i++){ + //for (int i=0; i < linux_perf_num_reading_events; i++){ + for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ uint64_t val[3]; - uint64_t scaled; + //uint64_t scaled; linux_perf_read_event_counter( linux_perf_reading_events[i], val); - update_counting_rate(val); - scaled = perf_scale(val); + //update_counting_rate(val); + //scaled = perf_scale(val); //fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); - memcpy(sd.cachelineReuseDistance, val, sizeof(uint64_t)*3);; + memcpy(sd.cachelineReuseDistance[i], val, sizeof(uint64_t)*3);; } //fprintf(stderr, "\n"); //sd.cachelineReuseDistance = cachelineCount; #endif + // register the watchpoint SubscribeWatchpoint(&sd, OVERWRITE, false /* 
capture value */); } diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index 8a15699a6c..20ad878edc 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -103,7 +103,7 @@ typedef struct SampleData{ bool isSamplePointAccurate; bool isBackTrace; ReuseType reuseType; - uint64_t cachelineReuseDistance[3]; + uint64_t cachelineReuseDistance[2][3]; } SampleData_t; typedef struct WatchPointInfo{ From e88b43214f5cded9af4b5a88244a0374877540c7 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 10 Mar 2018 17:39:47 -0500 Subject: [PATCH 21/43] Use another trace file to output the result from reuse client --- .../sample-sources/watchpoint_clients.c | 82 +++++++++++++++++-- src/tool/hpcrun/thread_data.h | 2 + 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 1331180eeb..ceb4515b0f 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -323,6 +323,54 @@ __thread long ipDiff=0; //static int some_overflow; +/****************************************************************************** + * private tool function +*****************************************************************************/ +static int OpenWitchTraceOutput(){ + #define OUTPUT_TRACE_BUFFER_SIZE (1 <<10) + char file_name[PATH_MAX]; + int ret = snprintf(file_name, PATH_MAX, "%s-%u.reuse.hpcrun", hpcrun_files_executable_name(), syscall(SYS_gettid)); + if ( ret < 0 || ret >= PATH_MAX){ + return -1; + } + int fd = open(file_name, O_WRONLY | O_CREAT | O_APPEND, 0644); + if (fd < 0){ + return -1; + } + ret = hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(OUTPUT_TRACE_BUFFER_SIZE), OUTPUT_TRACE_BUFFER_SIZE, HPCIO_OUTBUF_UNLOCKED); + if (ret != HPCFMT_OK){ + 
return -1; + } + return 0; +} + +static void CloseWitchTraceOutput(){ + hpcio_outbuf_t *out_ptr = &(TD_GET(witch_client_trace_output)); + if (out_ptr->fd >= 0){ + hpcio_outbuf_close(out_ptr); + } +} + +static int WriteWitchTraceOutput(const char *fmt, ...){ + #define LOCAL_BUFFER_SIZE 1024 + va_list arg; + char local_buf[LOCAL_BUFFER_SIZE]; + va_start(arg, fmt); + int data_size = vsnprintf(local_buf, LOCAL_BUFFER_SIZE, fmt, arg); + va_end(arg); + if (data_size < 0 && data_size >= LOCAL_BUFFER_SIZE){ + return -1; + } + int ret = hpcio_outbuf_write(&(TD_GET(witch_client_trace_output)), local_buf, data_size); + if (ret != data_size){ + return -1; + } + return 0; +} + + + + /****************************************************************************** * method functions *****************************************************************************/ @@ -532,6 +580,7 @@ METHOD_FN(start) return; } td->ss_state[self->sel_idx] = START; + assert(OpenWitchTraceOutput()==0); } static void ClientTermination(){ @@ -561,13 +610,18 @@ static void ClientTermination(){ case WP_REUSE: { uint64_t val[3]; - fprintf(stderr, "FINAL_COUNTING:"); + //fprintf(stderr, "FINAL_COUNTING:"); + WriteWitchTraceOutput("FINAL_COUNTING:"); + for (int i=0; i < MIN(2,linux_perf_num_reading_events); i++){ linux_perf_read_event_counter( linux_perf_reading_events[i], val); - //fprintf(stderr,"FINAL_COUNTING: %lu\n" , perf_scale(val) );//jqswang - fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang + //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang + WriteWitchTraceOutput(" %lu %lu %lu,", val[0], val[1], val[2]); } - fprintf(stderr, "\n"); + //fprintf(stderr, "\n"); + WriteWitchTraceOutput("\n"); + //close the trace output + CloseWitchTraceOutput(); hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); @@ -762,7 +816,7 @@ METHOD_FN(stop) TMSG(WATCHPOINT,"*WARNING* WATCHPOINT stop called when not in state START"); return; } - + 
ClientTermination(); if (ENABLED(PRINTTOPN)) @@ -857,6 +911,14 @@ METHOD_FN(process_event_list, int lush_metrics) break; case WP_REUSE: + { + //set up the trace output + //char file_name[PATH_MAX]; + //int ret = snprintf(file_name, PATH_MAX, "%s-%d.reuse.hpcrun", hpcrun_get_executable_name(), TD_GET(core_profile_trace_data.id)); + //int fd = open(str, O_WRONLY | O_CREAT | O_EXCL, 0644); + //assert(fd > 0); + //hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(1<<10), 1<<10, HPCIO_OUTBUF_UNLOCKED); + temporal_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); spatial_reuse_metric_id = hpcrun_new_metric(); @@ -867,6 +929,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(reuse_cacheline_distance_metric_id, "CACHELIN_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_trapped_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(reuse_trapped_metric_id, "REUSE_TRAP_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); + } break; case WP_ALL_SHARING: case WP_IPC_ALL_SHARING: @@ -1435,11 +1498,14 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } - fprintf(stderr, "REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); +// fprintf(stderr, "REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); + WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); for(int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ - fprintf(stderr, " %lu %lu %lu,", 
val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); +// fprintf(stderr, " %lu %lu %lu,", val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); + WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); } - fprintf(stderr, "\n"); +// fprintf(stderr, "\n"); + WriteWitchTraceOutput("\n"); return ALREADY_DISABLED; diff --git a/src/tool/hpcrun/thread_data.h b/src/tool/hpcrun/thread_data.h index 00dd0dbc8a..29e387d678 100644 --- a/src/tool/hpcrun/thread_data.h +++ b/src/tool/hpcrun/thread_data.h @@ -194,6 +194,8 @@ typedef struct thread_data_t { core_profile_trace_data_t core_profile_trace_data; + hpcio_outbuf_t witch_client_trace_output; //jqswang: it is used to output any data from witch client for post-mortem processing + // ---------------------------------------- // backtrace buffer // ---------------------------------------- From 82518451607f3fb84ec98ed917f96c6bf559ed91 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Thu, 15 Mar 2018 15:34:03 -0400 Subject: [PATCH 22/43] Fixed a bug --- src/tool/hpcrun/metrics.c | 9 +++++++-- src/tool/hpcrun/sample-sources/watchpoint_support.c | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c index ae32854c35..efdc455c77 100644 --- a/src/tool/hpcrun/metrics.c +++ b/src/tool/hpcrun/metrics.c @@ -509,8 +509,13 @@ hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, diff->i = (loc1->i - loc2->i); break; case MetricFlags_ValFmt_Real: - assert(loc1->r >= loc2->r); - diff->r = (loc1->r - loc2->r); + //assert(loc1->r >= loc2->r); //jqswang + if (loc1->r < loc2->r){ + diff->r = 0; + } + else { + diff->r = 
(loc1->r - loc2->r); + } diffWithPeriod->r = (loc1->r - loc2->r) * minfo1->period; break; default: diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index a94ee469e6..2394b72f59 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -486,6 +486,7 @@ static void CreateDummyHardwareEvent(void) { int perf_fd = perf_event_open(&pe, 0, -1, -1, 0); if (perf_fd == -1) { EMSG("Failed to open perf event file: %s\n", strerror(errno)); + printf("errno: %d %s\n", errno, strerror(errno)); monitor_real_abort(); } tData.lbrDummyFD = perf_fd; From 430b1d2debdf0d6ee654d547b6c72de465a67817 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Thu, 15 Mar 2018 16:45:57 -0400 Subject: [PATCH 23/43] Changed back to support reuse optimization --- .../sample-sources/watchpoint_clients.c | 222 +++++++++--------- 1 file changed, 109 insertions(+), 113 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index ceb4515b0f..b0d48c1fed 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -155,10 +155,6 @@ int false_wr_metric_id = -1; int true_ww_metric_id = -1; int true_rw_metric_id = -1; int true_wr_metric_id = -1; -int my_iteration = 0;//jqswang -void increment_iteration(){ - my_iteration++; -} extern int reuse_cacheline_distance_event_index; @@ -324,7 +320,7 @@ __thread long ipDiff=0; /****************************************************************************** - * private tool function + * private tool function *****************************************************************************/ static int OpenWitchTraceOutput(){ #define OUTPUT_TRACE_BUFFER_SIZE (1 <<10) @@ -335,7 +331,7 @@ static int OpenWitchTraceOutput(){ } int fd = open(file_name, O_WRONLY | O_CREAT | O_APPEND, 0644); if (fd < 0){ - return 
-1; + return -1; } ret = hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(OUTPUT_TRACE_BUFFER_SIZE), OUTPUT_TRACE_BUFFER_SIZE, HPCIO_OUTBUF_UNLOCKED); if (ret != HPCFMT_OK){ @@ -468,7 +464,7 @@ static WpClientConfig_t wpClientConfig[] = { .preWPAction = DISABLE_WP, .configOverrideCallback = IPCTrueSharingWPConfigOverride } - + }; @@ -574,13 +570,13 @@ METHOD_FN(start) { thread_data_t* td = hpcrun_get_thread_data(); source_state_t my_state = TD_GET(ss_state)[self->sel_idx]; - + if (my_state == START) { TMSG(WATCHPOINT,"*NOTE* WATCHPOINT start called when already in state START"); return; } td->ss_state[self->sel_idx] = START; - assert(OpenWitchTraceOutput()==0); + //assert(OpenWitchTraceOutput()==0); //jqswang: for reuse-histo } static void ClientTermination(){ @@ -588,7 +584,7 @@ static void ClientTermination(){ hpcrun_stats_num_samples_imprecise_inc(wpStats.numImpreciseSamples); hpcrun_stats_num_watchpoints_set_inc(wpStats.numWatchpointsSet); WatchpointThreadTerminate(); - + switch (theWPConfig->id) { case WP_DEADSPY: hpcrun_stats_num_writtenBytes_inc(writtenBytes); @@ -609,6 +605,7 @@ static void ClientTermination(){ break; case WP_REUSE: { + #if 0 //jqswang: for reuse-histo uint64_t val[3]; //fprintf(stderr, "FINAL_COUNTING:"); WriteWitchTraceOutput("FINAL_COUNTING:"); @@ -618,11 +615,11 @@ static void ClientTermination(){ //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang WriteWitchTraceOutput(" %lu %lu %lu,", val[0], val[1], val[2]); } - //fprintf(stderr, "\n"); + //fprintf(stderr, "\n"); WriteWitchTraceOutput("\n"); //close the trace output CloseWitchTraceOutput(); - + #endif hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); hpcrun_stats_num_accessedIns_inc(accessedIns); @@ -652,7 +649,7 @@ static void ClientTermination(){ hpcrun_stats_num_trueWWIns_inc(trueWWIns); hpcrun_stats_num_trueRWIns_inc(trueRWIns); hpcrun_stats_num_trueWRIns_inc(trueWRIns); - + default: 
break; } @@ -678,10 +675,10 @@ TopN(cct_node_t* node, cct_op_arg_t arg, size_t level) if (!set) return; hpcrun_metricVal_t *loc = hpcrun_metric_set_loc(set, metricID); if (!loc) return; - + uint64_t val = loc->i; if (val == 0) return; - + for (i=0; icore_profile_trace_data.epoch->csdata.tree_root; //TODO: partial? cct_node_t *partial = td->core_profile_trace_data.epoch->csdata.partial_unw_root; - + // trave root first and then partial second hpcrun_cct_walk_node_1st(root, TopN, (void *) metricID); - + int i, j; for (i=0; ii; - + if (val2 > val1) { cct_node_t *tmp = topNNode[i]; topNNode[i] = topNNode[j]; @@ -746,9 +743,9 @@ PrintTopN(int metricID) path = default_path; } sprintf(path, "%s/%s", path, "topN.log"); - + fd = fopen(path, "a+"); - + int libmonitorId, libhpcrunId; // print loadmodule info first fprintf (fd, "\n"); @@ -806,22 +803,22 @@ METHOD_FN(stop) //thread_data_t *td = hpcrun_get_thread_data(); //int nevents = self->evl.nevents; source_state_t my_state = TD_GET(ss_state)[self->sel_idx]; - + if (my_state == STOP) { TMSG(WATCHPOINT,"*NOTE* WATCHPOINT stop called when already in state STOP"); return; } - + if (my_state != START) { TMSG(WATCHPOINT,"*WARNING* WATCHPOINT stop called when not in state START"); return; } ClientTermination(); - + if (ENABLED(PRINTTOPN)) PrintTopN(dead_metric_id); - + TD_GET(ss_state)[self->sel_idx] = STOP; } @@ -829,7 +826,7 @@ static void METHOD_FN(shutdown) { TMSG(WATCHPOINT, "shutdown"); - + METHOD_CALL(self, stop); // make sure stop has been called self->state = UNINIT; } @@ -873,7 +870,7 @@ METHOD_FN(process_event_list, int lush_metrics) } char* evlist = METHOD_CALL(self, get_event_str); char* event = start_tok(evlist); - + // only one supported for(int i = 0; i < WP_MAX_CLIENTS; i++) { if (hpcrun_ev_is(event, wpClientConfig[i].name)) { @@ -881,17 +878,17 @@ METHOD_FN(process_event_list, int lush_metrics) break; } } - + wpStats.numImpreciseSamples = 0; wpStats.numWatchpointsSet = 0; 
WatchpointThreadInit(theWPConfig->wpCallback); - + if(theWPConfig->configOverrideCallback){ theWPConfig->configOverrideCallback(0); } - + PopulateBlackListAddresses(); - + switch (theWPConfig->id) { case WP_DEADSPY: measured_metric_id = hpcrun_new_metric(); @@ -899,7 +896,7 @@ METHOD_FN(process_event_list, int lush_metrics) dead_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(dead_metric_id, "BYTES_DEAD", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - + case WP_REDSPY: case WP_LOADSPY: measured_metric_id = hpcrun_new_metric(); @@ -909,12 +906,12 @@ METHOD_FN(process_event_list, int lush_metrics) redApprox_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(redApprox_metric_id, "BYTES_RED_APPROX", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - + case WP_REUSE: { //set up the trace output //char file_name[PATH_MAX]; - //int ret = snprintf(file_name, PATH_MAX, "%s-%d.reuse.hpcrun", hpcrun_get_executable_name(), TD_GET(core_profile_trace_data.id)); + //int ret = snprintf(file_name, PATH_MAX, "%s-%d.reuse.hpcrun", hpcrun_get_executable_name(), TD_GET(core_profile_trace_data.id)); //int fd = open(str, O_WRONLY | O_CREAT | O_EXCL, 0644); //assert(fd > 0); //hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(1<<10), 1<<10, HPCIO_OUTBUF_UNLOCKED); @@ -940,7 +937,7 @@ METHOD_FN(process_event_list, int lush_metrics) SetUpFalseSharingMetrics(); SetUpTrueSharingMetrics(); break; - + case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: // must have a canonical load map across processes @@ -949,7 +946,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(measured_metric_id, "MONITORED", MetricFlags_ValFmt_Int, 1, metric_property_none); SetUpFalseSharingMetrics(); break; - + case WP_TRUE_SHARING: case WP_IPC_TRUE_SHARING: // must have a canonical load map across processes @@ -958,7 +955,7 @@ METHOD_FN(process_event_list, int lush_metrics) 
hpcrun_set_metric_info_and_period(measured_metric_id, "MONITORED", MetricFlags_ValFmt_Int, 1, metric_property_none); SetUpTrueSharingMetrics(); break; - + default: break; } @@ -1219,14 +1216,14 @@ static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startO // if the ip is 0, let's drop the WP return ALREADY_DISABLED; } - + // This is a approximation. // If we took N samples at wpi->sample.node since the last time a WP triggered here, // If this a dead write, we'll update the dead_writes metric at the call path sample.node:KILLED_BY:curctxt> // Otherwise (not dead), we'll update the used_writes metric at the call path sample.node:USED_BY:curctxt> // In either case, the increment will be (N * overlapBytes) // Bump up watermark_metric_id to match sampledMetricId - + double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); int overlapBytes = GET_OVERLAP_BYTES(wpi->sample.va, wpi->sample.wpLength, wt->va, wt->accessLength); @@ -1234,7 +1231,7 @@ static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startO fprintf(stderr, "\n wpi->sample.va=%p, wpi->sample.wpLength = %d, wt->va = %p, wt->accessLength=%d\n", wpi->sample.va, wpi->sample.wpLength, wt->va, wt->accessLength); monitor_real_abort(); } - + // Now increment dead_metric_id by numDiffSamples * wpi->sample.accessLength // I could have done numDiffSamples * overlapBytes, but it will cause misattribution when access sizes are not same at dead and kill sites. // Basically, we are assuming that whatever happened in the observed watchpoints is applicable to the entire access length @@ -1275,10 +1272,10 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf // if the ip is 0, let's drop the WP return ALREADY_DISABLED; } - + bool isFloatOperation = wt->floatType == ELEM_TYPE_UNKNOWN? 
false: true; bool redBytes = 0; - + // check integer instructions int overlapLen = GET_OVERLAP_BYTES(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); if(overlapLen <= 0){ @@ -1289,17 +1286,17 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf int joinNodeIdx = wpi->sample.isSamplePointAccurate? E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; int firstOffest = FIRST_OVERLAPPED_BYTE_OFFSET_IN_FIRST(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); int secondOffest = FIRST_OVERLAPPED_BYTE_OFFSET_IN_FIRST(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); - + void * wpiStartByte = wpi->sample.va + secondOffest; void * wtStartByte = wt->va + firstOffest; // if the overlapLen is not 4 or 8, we cannot do any FP, DP approximation. //wpiStartByte and wtStartByte are not 4 or 8 byte aligned, we cannot do any FP, DP approximation. - + // If we got an insane address that cannot be read, return silently if(!IsAddressReadable(wtStartByte)){ return ALREADY_DISABLED; } - + if(isFloatOperation){ switch (wt->floatType) { case ELEM_TYPE_SINGLE:{ @@ -1329,7 +1326,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } } break; - + case ELEM_TYPE_DOUBLE:{ if(overlapLen < sizeof(double)){ goto TreatLikeInteger; @@ -1355,7 +1352,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } } break; - + default: // unhandled!! 
goto TreatLikeInteger; break; @@ -1378,10 +1375,10 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_NEW_VAL][joinNodeIdx] /* joinNode*/, measured_metric_id /* checkedMetric */, inc); } }else /* non float */{ - + TreatLikeInteger: ; - + for(int i = firstOffest, k = secondOffest ; i < firstOffest + overlapLen; i++, k++){ if(((uint8_t*)(wt->va))[i] == wpi->value[k]) { redBytes ++; @@ -1392,7 +1389,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - + if(redBytes != 0) { // Now increment metric: if the entire overlap is redundant, amplify to numDiffSamples * wpi->sample.accessLength // This is an approximation of what might have happened. @@ -1419,10 +1416,6 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse return ALREADY_DISABLED; } - if( wt->accessType == STORE) //jqswang - return RETAIN_WP; - - // Report a reuse double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); @@ -1472,13 +1465,13 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse } #endif - + // fprintf(stderr, "REUSE_DISTANCE: %c %lu %lu %lu\n", marker, cacheline_distance, inc, val[1] - wpi->sample.cachelineReuseDistance[1]); //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; - + metricIncArray[0]=inc; metricIdArray[1]=reuse_time_distance_metric_id; metricIncArray[1]=time_distance; metricIdArray[2]=reuse_cacheline_distance_metric_id; metricIncArray[2]=cacheline_distance; @@ -1498,15 +1491,17 @@ static 
WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); } -// fprintf(stderr, "REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); +#if 0 //jqswang : it is for reuse-histo WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); for(int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ -// fprintf(stderr, " %lu %lu %lu,", val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); + WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); } // fprintf(stderr, "\n"); WriteWitchTraceOutput("\n"); +#else +#endif return ALREADY_DISABLED; } @@ -1544,7 +1539,7 @@ static WPTriggerActionType FalseSharingWPCallback(WatchPointInfo_t *wpi, int sta joinNode = joinNodes[E_FALSE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1579,7 +1574,7 @@ static WPTriggerActionType TrueSharingWPCallback(WatchPointInfo_t *wpi, int star joinNode = joinNodes[E_TRUE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1610,7 +1605,7 @@ static WPTriggerActionType IPCFalseSharingWPCallback(WatchPointInfo_t 
*wpi, int joinNode = joinNodes[E_IPC_FALSE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1639,7 +1634,7 @@ static WPTriggerActionType IPCTrueSharingWPCallback(WatchPointInfo_t *wpi, int s joinNode = joinNodes[E_IPC_TRUE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1658,7 +1653,7 @@ static inline bool IsLibMonitorAddress(void * addr) { if(!libmonitorLM){ libmonitorLM = hpcrun_loadmap_findByName(hpcrun_loadmap_findLoadName("libmonitor.so"))->dso_info; } - + if (addr >= libmonitorLM->start_addr && addr < libmonitorLM->end_addr){ return true; } @@ -1669,7 +1664,7 @@ static inline bool IsHPCRunAddress(void * addr) { if(!hpcrunLM){ hpcrunLM = hpcrun_loadmap_findByName(hpcrun_loadmap_findLoadName("libhpcrun.so"))->dso_info; } - + if (addr >= hpcrunLM->start_addr && addr < hpcrunLM->end_addr){ return true; } @@ -1697,7 +1692,7 @@ static inline bool IsValidAddress(void * addr, void * pc){ thread_data_t * td = hpcrun_get_thread_data(); if( (addr == 0) ) return false; - + if( (pc == 0) ) return false; @@ -1706,15 +1701,15 @@ static inline bool IsValidAddress(void * addr, void * pc){ if(IsAltStackAddress(addr)) return false; if(IsFSorGS(addr)) - return false; - + return false; + if(IsBlackListedWatchpointAddress(addr) || IsBlackListedWatchpointAddress(pc)){ return false; } - + if (isTdataAddress(addr)) return false; - + if((addr && !(((unsigned long)addr) & 0xF0000000000000)) && (pc && !(((unsigned long)pc) & 0xF0000000000000))) return true; @@ -1728,7 +1723,7 @@ void ReadSharedDataTransactionally(SharedData_t *localSharedData){ int64_t startCounter = 
gSharedData.counter; if(startCounter & 1) continue; // Some writer is updating - + __sync_synchronize(); *localSharedData = gSharedData; __sync_synchronize(); @@ -1752,7 +1747,7 @@ int static inline GetFloorWPLength(int accessLen){ int static inline GetFloorWPLengthAtAddress(void * address, int accessLen){ uint8_t alignment = ((size_t) address) & (MAX_WP_LENGTH -1); - + switch (alignment) { case 1: case 3: case 5: case 7: /* 1-byte aligned */ return 1; case 2: case 6: /* 2-byte aligned */ return MIN(2, accessLen); @@ -1877,7 +1872,7 @@ void ReadIPCSharedDataTransactionally(IPC_FSInfo *ipcFSInfo){ int64_t startCounter = ipcSharedData->counter; if(startCounter & 1) continue; // Some writer is updating - + __sync_synchronize(); *ipcFSInfo = ipcSharedData->fsInfo; __sync_synchronize(); @@ -1911,11 +1906,11 @@ static inline void create_shared_memory() { if(__sync_bool_compare_and_swap(&ipcSharedData, 0, ptr)){ hpcrun_process_aux_cleanup_add(destroy_shared_memory, NULL); } - + } uint16_t GetOrCreateIPCSharedLMEntry(const char * realPath){ - + if(ipcSharedData == NULL) create_shared_memory(); // start from 1; leave 0 out; @@ -1957,7 +1952,7 @@ unsigned long GetPFN(unsigned long virt_addr){ printf("Error! 
Cannot open %s\n", PA_PATH); goto ErrExit; } - + //Shifting by virt-addr-offset number of bytes //and multiplying by the size of an address (the size of an entry in pagemap file) uint64_t file_offset = virt_addr / getpagesize() * PAGEMAP_ENTRY; @@ -1980,9 +1975,9 @@ unsigned long GetPFN(unsigned long virt_addr){ else c_buf[PAGEMAP_ENTRY - i - 1] = c; } - + fclose(f); - + for(int i=0; i < PAGEMAP_ENTRY; i++){ //printf("%d ",c_buf[i]); read_val = (read_val << 8) + c_buf[i]; @@ -1995,14 +1990,14 @@ unsigned long GetPFN(unsigned long virt_addr){ // printf("Page not present\n"); // if(GET_BIT(read_val, 62)) // printf("Page swapped\n"); - + return INVALID_PHYSICAL_ADDRESS; ErrExit: if(f){ fclose(f); } return INVALID_PHYSICAL_ADDRESS; - + } @@ -2020,7 +2015,7 @@ static inline struct VAPAMap* splayPAtoVAMap(struct VAPAMap* root, unsigned long static void InsertVAtoPAMap(void * va, unsigned long pa){ VAPAMap_t * found = splayVAtoPAMap(vaToPAMap, va); - + // Check if a trace node with traceKey already exists under this context node if(found && (va == found->virtualAddress)) { vaToPAMap = found; @@ -2049,7 +2044,7 @@ static void InsertVAtoPAMap(void * va, unsigned long pa){ static void InsertPAtoVAMap(unsigned long pa, void * va){ VAPAMap_t * found = splayPAtoVAMap(paToVAMap, pa); - + // Check if a trace node with traceKey already exists under this context node if(found && (pa == found->physicalAddress)) { paToVAMap = found; @@ -2142,7 +2137,7 @@ static void UpdateVMMap(){ if(s != 0){ fprintf(stderr, "\n Failed to STAT %s", VA_PATH); } - + if( ((lastVMMAPCheck % VM_MAP_CHECK_FREQUENCY) == 0) && (lastMapChangeTime != mapsStat.st_mtime)) { // New mapping @@ -2165,11 +2160,11 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, unsigned long pa = GetPAfromVA(data_addr); // Ok, on a shared page! // Ok to publish new data? 
- + // Is the published address old enough (stayed for > 1 sample time span) int64_t curTime = rdtsc(); volatile IPC_FSInfo * globalIPCInfo = &(ipcSharedData->fsInfo); - + pid_t me = myTid; // Get the time, tid, and counter // This is definately racy but benign. @@ -2180,14 +2175,14 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, && (pa != INVALID_PHYSICAL_ADDRESS) // my PA is a valid address ) { // Attempt to lockout - + if(__sync_bool_compare_and_swap(&(ipcSharedData->counter), theCounter, theCounter+1)){ } else { // Failed to update ==> someone else succeeded ==> Fetch that address and set a WP for that goto SET_FS_WP; } - - + + globalIPCInfo->time = rdtsc(); globalIPCInfo->tid = myTid; globalIPCInfo->wpType = accessType == LOAD ? WP_WRITE : WP_RW; @@ -2195,7 +2190,7 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, globalIPCInfo->address = pa; globalIPCInfo->offset = PAGE_OFFSET(data_addr); globalIPCInfo->accessLen = accessLen; - + int btLen = 0; for(; btLen < MAX_BACKTRACE_LEN - 1; btLen++){ if (node == NULL) @@ -2203,7 +2198,7 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, globalIPCInfo->backtrace[btLen] = *hpcrun_cct_addr(node); node = hpcrun_cct_parent(node); } - + // unlikely; if btLen == MAX_BACKTRACE_LEN; drop the WP by invalidating it if (btLen == MAX_BACKTRACE_LEN -1 ) { globalIPCInfo->tid = -1; @@ -2223,12 +2218,12 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, if(va == INVALID_VIRUAL_ADDRESS) { goto ErrExit; } - + va = va + localIPCInfo.offset; - + long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + switch (theWPConfig->id) { case WP_IPC_TRUE_SHARING:{ // Set WP at the same address @@ -2296,7 +2291,7 @@ bool PrintStats(){ void * contextIP = hpcrun_context_pc(context); int v1 = get_access_type(mmap_data->ip); int v2 = get_access_type(contextIP); - + switch(v1){ 
case 0: unk1++; break; case 1: ld1++; break; @@ -2311,7 +2306,7 @@ bool PrintStats(){ case 3: mix2++; break; default: break; } - + float tot = unk1 + ld1 + st1 + mix1; fprintf(stderr, "W=%f (%f), L=%f(%f), M=%f(%f), U=%f(%f)\n", st1/tot, st2/tot, ld1/tot, ld2/tot, mix1/tot, mix2/tot, unk1/tot, unk2/tot); /* @@ -2325,7 +2320,7 @@ bool PrintStats(){ void * contextIP = hpcrun_context_pc(context); extern int is_same_function(void *ins1, void* ins2); int samev1 = is_same_function(contextIP, mmap_data->ip); - + switch(samev1){ case 0: difffunc++; break; case 1: samefunc++; break; @@ -2348,12 +2343,12 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, if (!IsValidAddress(data_addr, precisePC)) { goto ErrExit; // incorrect access type } - + // do not monitor NULL CCT node if (node == NULL) { goto ErrExit; // incorrect CCT } - + // fprintf(stderr, " numWatchpointsSet=%lu\n", wpStats.numWatchpointsSet); int accessLen; @@ -2367,7 +2362,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, //EMSG("Sampled sd.accessType = %d, accessLen=%d at precisePC = %p\n", accessType, accessLen, precisePC); goto ErrExit; // incorrect access type } - + // if the context PC and precise PC are not in the same function, then the sample point is inaccurate. 
bool isSamplePointAccurate; FunctionType ft = is_same_function(contextPC, precisePC); @@ -2376,14 +2371,14 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } else { isSamplePointAccurate = false; } - + switch (theWPConfig->id) { case WP_DEADSPY:{ if(accessType == LOAD){ //EMSG("Sampled accessType = %d\n", accessType); goto ErrExit; // incorrect access type } - + long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; writtenBytes += accessLen * metricThreshold; SampleData_t sd= { @@ -2402,7 +2397,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; - + case WP_REDSPY:{ // If we got an insane address that cannot be read, return silently if(!IsAddressReadable(data_addr)){ @@ -2478,9 +2473,10 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .preWPAction=theWPConfig->preWPAction, .isBackTrace = false, }; - sd.wpLength = 4;//GetFloorWPLength(accessLen); //jqswang - //if (rdtsc() & 1) { // 50% chance to detect spatial reuse - if (0){ //jqswang: testing, always temporal reuse + sd.wpLength = GetFloorWPLength(accessLen); + //sd.wpLength = 4;// jqswang: it is for reuse-histo + if (rdtsc() & 1) { // 50% chance to detect spatial reuse + //if (0){ //jqswang: it is for reuse-histo int wpSizes[] = {8, 4, 2, 1}; FalseSharingLocs falseSharingLocs[CACHE_LINE_SZ]; int numFSLocs = 0; @@ -2507,8 +2503,9 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, #endif } else { // 50% chance to detect the temporal reuse - sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; //data_addr; //jqswang - sd.reuseType = REUSE_TEMPORAL; + sd.va = data_addr; + //sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; //jqswang: it is for reuse-histo + sd.reuseType = REUSE_TEMPORAL; } if (!IsValidAddress(sd.va, precisePC)) { goto ErrExit; // incorrect access type @@ -2554,7 +2551,7 @@ bool 
OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, case WP_FALSE_SHARING: case WP_TRUE_SHARING: case WP_ALL_SHARING:{ - + // Is the published address old enough (stayed for > 1 sample time span) int64_t curTime = rdtsc(); SharedData_t localSharedData; @@ -2564,7 +2561,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, localSharedData.time = gSharedData.time; localSharedData.tid = gSharedData.tid; localSharedData.counter = gSharedData.counter; - + //ReadSharedDataTransactionally(&localSharedData); if( ((curTime-localSharedData.time) > 2 * (curTime-lastTime)) // Sufficient time passed since the last time somebody published && @@ -2580,7 +2577,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, localSharedData.accessLen = accessLen; localSharedData.counter ++; // makes the counter odd localSharedData.node = node; - + if(__sync_bool_compare_and_swap(&gSharedData.counter, theCounter, theCounter+1)){ gSharedData = localSharedData; __sync_synchronize(); @@ -2594,7 +2591,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, SET_FS_WP: ReadSharedDataTransactionally(&localSharedData); long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + switch (theWPConfig->id) { case WP_TRUE_SHARING:{ // Set WP at the same address @@ -2679,7 +2676,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, lastTime = curTime; } break; - + case WP_IPC_FALSE_SHARING: case WP_IPC_TRUE_SHARING: { UpdateVMMap(); @@ -2691,10 +2688,9 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } wpStats.numWatchpointsSet ++; return true; - + ErrExit: wpStats.numImpreciseSamples ++; return false; - -} +} From 0300a715f333a4195be549f8b02463692d3259db Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Thu, 15 Mar 2018 16:48:03 -0400 Subject: [PATCH 24/43] Minors --- 
.../sample-sources/watchpoint_clients.c | 198 +++++++++--------- 1 file changed, 98 insertions(+), 100 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index ceb4515b0f..0eec77117f 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -324,7 +324,7 @@ __thread long ipDiff=0; /****************************************************************************** - * private tool function + * private tool function *****************************************************************************/ static int OpenWitchTraceOutput(){ #define OUTPUT_TRACE_BUFFER_SIZE (1 <<10) @@ -335,7 +335,7 @@ static int OpenWitchTraceOutput(){ } int fd = open(file_name, O_WRONLY | O_CREAT | O_APPEND, 0644); if (fd < 0){ - return -1; + return -1; } ret = hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(OUTPUT_TRACE_BUFFER_SIZE), OUTPUT_TRACE_BUFFER_SIZE, HPCIO_OUTBUF_UNLOCKED); if (ret != HPCFMT_OK){ @@ -468,7 +468,7 @@ static WpClientConfig_t wpClientConfig[] = { .preWPAction = DISABLE_WP, .configOverrideCallback = IPCTrueSharingWPConfigOverride } - + }; @@ -574,7 +574,7 @@ METHOD_FN(start) { thread_data_t* td = hpcrun_get_thread_data(); source_state_t my_state = TD_GET(ss_state)[self->sel_idx]; - + if (my_state == START) { TMSG(WATCHPOINT,"*NOTE* WATCHPOINT start called when already in state START"); return; @@ -588,7 +588,7 @@ static void ClientTermination(){ hpcrun_stats_num_samples_imprecise_inc(wpStats.numImpreciseSamples); hpcrun_stats_num_watchpoints_set_inc(wpStats.numWatchpointsSet); WatchpointThreadTerminate(); - + switch (theWPConfig->id) { case WP_DEADSPY: hpcrun_stats_num_writtenBytes_inc(writtenBytes); @@ -618,7 +618,7 @@ static void ClientTermination(){ //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang WriteWitchTraceOutput(" %lu %lu %lu,", val[0], val[1], val[2]); } - 
//fprintf(stderr, "\n"); + //fprintf(stderr, "\n"); WriteWitchTraceOutput("\n"); //close the trace output CloseWitchTraceOutput(); @@ -652,7 +652,7 @@ static void ClientTermination(){ hpcrun_stats_num_trueWWIns_inc(trueWWIns); hpcrun_stats_num_trueRWIns_inc(trueRWIns); hpcrun_stats_num_trueWRIns_inc(trueWRIns); - + default: break; } @@ -678,10 +678,10 @@ TopN(cct_node_t* node, cct_op_arg_t arg, size_t level) if (!set) return; hpcrun_metricVal_t *loc = hpcrun_metric_set_loc(set, metricID); if (!loc) return; - + uint64_t val = loc->i; if (val == 0) return; - + for (i=0; icore_profile_trace_data.epoch->csdata.tree_root; //TODO: partial? cct_node_t *partial = td->core_profile_trace_data.epoch->csdata.partial_unw_root; - + // trave root first and then partial second hpcrun_cct_walk_node_1st(root, TopN, (void *) metricID); - + int i, j; for (i=0; ii; - + if (val2 > val1) { cct_node_t *tmp = topNNode[i]; topNNode[i] = topNNode[j]; @@ -746,9 +746,9 @@ PrintTopN(int metricID) path = default_path; } sprintf(path, "%s/%s", path, "topN.log"); - + fd = fopen(path, "a+"); - + int libmonitorId, libhpcrunId; // print loadmodule info first fprintf (fd, "\n"); @@ -806,22 +806,22 @@ METHOD_FN(stop) //thread_data_t *td = hpcrun_get_thread_data(); //int nevents = self->evl.nevents; source_state_t my_state = TD_GET(ss_state)[self->sel_idx]; - + if (my_state == STOP) { TMSG(WATCHPOINT,"*NOTE* WATCHPOINT stop called when already in state STOP"); return; } - + if (my_state != START) { TMSG(WATCHPOINT,"*WARNING* WATCHPOINT stop called when not in state START"); return; } ClientTermination(); - + if (ENABLED(PRINTTOPN)) PrintTopN(dead_metric_id); - + TD_GET(ss_state)[self->sel_idx] = STOP; } @@ -829,7 +829,7 @@ static void METHOD_FN(shutdown) { TMSG(WATCHPOINT, "shutdown"); - + METHOD_CALL(self, stop); // make sure stop has been called self->state = UNINIT; } @@ -873,7 +873,7 @@ METHOD_FN(process_event_list, int lush_metrics) } char* evlist = METHOD_CALL(self, get_event_str); char* event = 
start_tok(evlist); - + // only one supported for(int i = 0; i < WP_MAX_CLIENTS; i++) { if (hpcrun_ev_is(event, wpClientConfig[i].name)) { @@ -881,17 +881,17 @@ METHOD_FN(process_event_list, int lush_metrics) break; } } - + wpStats.numImpreciseSamples = 0; wpStats.numWatchpointsSet = 0; WatchpointThreadInit(theWPConfig->wpCallback); - + if(theWPConfig->configOverrideCallback){ theWPConfig->configOverrideCallback(0); } - + PopulateBlackListAddresses(); - + switch (theWPConfig->id) { case WP_DEADSPY: measured_metric_id = hpcrun_new_metric(); @@ -899,7 +899,7 @@ METHOD_FN(process_event_list, int lush_metrics) dead_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(dead_metric_id, "BYTES_DEAD", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - + case WP_REDSPY: case WP_LOADSPY: measured_metric_id = hpcrun_new_metric(); @@ -909,12 +909,12 @@ METHOD_FN(process_event_list, int lush_metrics) redApprox_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(redApprox_metric_id, "BYTES_RED_APPROX", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - + case WP_REUSE: { //set up the trace output //char file_name[PATH_MAX]; - //int ret = snprintf(file_name, PATH_MAX, "%s-%d.reuse.hpcrun", hpcrun_get_executable_name(), TD_GET(core_profile_trace_data.id)); + //int ret = snprintf(file_name, PATH_MAX, "%s-%d.reuse.hpcrun", hpcrun_get_executable_name(), TD_GET(core_profile_trace_data.id)); //int fd = open(str, O_WRONLY | O_CREAT | O_EXCL, 0644); //assert(fd > 0); //hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(1<<10), 1<<10, HPCIO_OUTBUF_UNLOCKED); @@ -940,7 +940,7 @@ METHOD_FN(process_event_list, int lush_metrics) SetUpFalseSharingMetrics(); SetUpTrueSharingMetrics(); break; - + case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: // must have a canonical load map across processes @@ -949,7 +949,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(measured_metric_id, "MONITORED", 
MetricFlags_ValFmt_Int, 1, metric_property_none); SetUpFalseSharingMetrics(); break; - + case WP_TRUE_SHARING: case WP_IPC_TRUE_SHARING: // must have a canonical load map across processes @@ -958,7 +958,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(measured_metric_id, "MONITORED", MetricFlags_ValFmt_Int, 1, metric_property_none); SetUpTrueSharingMetrics(); break; - + default: break; } @@ -1219,14 +1219,14 @@ static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startO // if the ip is 0, let's drop the WP return ALREADY_DISABLED; } - + // This is a approximation. // If we took N samples at wpi->sample.node since the last time a WP triggered here, // If this a dead write, we'll update the dead_writes metric at the call path sample.node:KILLED_BY:curctxt> // Otherwise (not dead), we'll update the used_writes metric at the call path sample.node:USED_BY:curctxt> // In either case, the increment will be (N * overlapBytes) // Bump up watermark_metric_id to match sampledMetricId - + double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); int overlapBytes = GET_OVERLAP_BYTES(wpi->sample.va, wpi->sample.wpLength, wt->va, wt->accessLength); @@ -1234,7 +1234,7 @@ static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startO fprintf(stderr, "\n wpi->sample.va=%p, wpi->sample.wpLength = %d, wt->va = %p, wt->accessLength=%d\n", wpi->sample.va, wpi->sample.wpLength, wt->va, wt->accessLength); monitor_real_abort(); } - + // Now increment dead_metric_id by numDiffSamples * wpi->sample.accessLength // I could have done numDiffSamples * overlapBytes, but it will cause misattribution when access sizes are not same at dead and kill sites. 
// Basically, we are assuming that whatever happened in the observed watchpoints is applicable to the entire access length @@ -1275,10 +1275,10 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf // if the ip is 0, let's drop the WP return ALREADY_DISABLED; } - + bool isFloatOperation = wt->floatType == ELEM_TYPE_UNKNOWN? false: true; bool redBytes = 0; - + // check integer instructions int overlapLen = GET_OVERLAP_BYTES(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); if(overlapLen <= 0){ @@ -1289,17 +1289,17 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf int joinNodeIdx = wpi->sample.isSamplePointAccurate? E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; int firstOffest = FIRST_OVERLAPPED_BYTE_OFFSET_IN_FIRST(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); int secondOffest = FIRST_OVERLAPPED_BYTE_OFFSET_IN_FIRST(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); - + void * wpiStartByte = wpi->sample.va + secondOffest; void * wtStartByte = wt->va + firstOffest; // if the overlapLen is not 4 or 8, we cannot do any FP, DP approximation. //wpiStartByte and wtStartByte are not 4 or 8 byte aligned, we cannot do any FP, DP approximation. - + // If we got an insane address that cannot be read, return silently if(!IsAddressReadable(wtStartByte)){ return ALREADY_DISABLED; } - + if(isFloatOperation){ switch (wt->floatType) { case ELEM_TYPE_SINGLE:{ @@ -1329,7 +1329,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } } break; - + case ELEM_TYPE_DOUBLE:{ if(overlapLen < sizeof(double)){ goto TreatLikeInteger; @@ -1355,7 +1355,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } } break; - + default: // unhandled!! 
goto TreatLikeInteger; break; @@ -1378,10 +1378,10 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_NEW_VAL][joinNodeIdx] /* joinNode*/, measured_metric_id /* checkedMetric */, inc); } }else /* non float */{ - + TreatLikeInteger: ; - + for(int i = firstOffest, k = secondOffest ; i < firstOffest + overlapLen; i++, k++){ if(((uint8_t*)(wt->va))[i] == wpi->value[k]) { redBytes ++; @@ -1392,7 +1392,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - + if(redBytes != 0) { // Now increment metric: if the entire overlap is redundant, amplify to numDiffSamples * wpi->sample.accessLength // This is an approximation of what might have happened. @@ -1419,9 +1419,8 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse return ALREADY_DISABLED; } - if( wt->accessType == STORE) //jqswang - return RETAIN_WP; - + //if( wt->accessType == STORE) //jqswang + //return RETAIN_WP; // Report a reuse double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); @@ -1472,13 +1471,13 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse } #endif - + // fprintf(stderr, "REUSE_DISTANCE: %c %lu %lu %lu\n", marker, cacheline_distance, inc, val[1] - wpi->sample.cachelineReuseDistance[1]); //prepare the metric updating arrays int metricIdArray[4]; uint64_t metricIncArray[4]; - + metricIncArray[0]=inc; metricIdArray[1]=reuse_time_distance_metric_id; metricIncArray[1]=time_distance; metricIdArray[2]=reuse_cacheline_distance_metric_id; metricIncArray[2]=cacheline_distance; @@ -1544,7 +1543,7 @@ static WPTriggerActionType FalseSharingWPCallback(WatchPointInfo_t *wpi, int sta 
joinNode = joinNodes[E_FALSE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1579,7 +1578,7 @@ static WPTriggerActionType TrueSharingWPCallback(WatchPointInfo_t *wpi, int star joinNode = joinNodes[E_TRUE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1610,7 +1609,7 @@ static WPTriggerActionType IPCFalseSharingWPCallback(WatchPointInfo_t *wpi, int joinNode = joinNodes[E_IPC_FALSE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1639,7 +1638,7 @@ static WPTriggerActionType IPCTrueSharingWPCallback(WatchPointInfo_t *wpi, int s joinNode = joinNodes[E_IPC_TRUE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1658,7 +1657,7 @@ static inline bool IsLibMonitorAddress(void * addr) { if(!libmonitorLM){ libmonitorLM = hpcrun_loadmap_findByName(hpcrun_loadmap_findLoadName("libmonitor.so"))->dso_info; } - + if (addr >= libmonitorLM->start_addr && addr < libmonitorLM->end_addr){ return true; } @@ -1669,7 +1668,7 @@ static inline bool IsHPCRunAddress(void * addr) { if(!hpcrunLM){ hpcrunLM = hpcrun_loadmap_findByName(hpcrun_loadmap_findLoadName("libhpcrun.so"))->dso_info; } - + if (addr >= hpcrunLM->start_addr && addr < hpcrunLM->end_addr){ return true; } @@ -1697,7 
+1696,7 @@ static inline bool IsValidAddress(void * addr, void * pc){ thread_data_t * td = hpcrun_get_thread_data(); if( (addr == 0) ) return false; - + if( (pc == 0) ) return false; @@ -1706,15 +1705,15 @@ static inline bool IsValidAddress(void * addr, void * pc){ if(IsAltStackAddress(addr)) return false; if(IsFSorGS(addr)) - return false; - + return false; + if(IsBlackListedWatchpointAddress(addr) || IsBlackListedWatchpointAddress(pc)){ return false; } - + if (isTdataAddress(addr)) return false; - + if((addr && !(((unsigned long)addr) & 0xF0000000000000)) && (pc && !(((unsigned long)pc) & 0xF0000000000000))) return true; @@ -1728,7 +1727,7 @@ void ReadSharedDataTransactionally(SharedData_t *localSharedData){ int64_t startCounter = gSharedData.counter; if(startCounter & 1) continue; // Some writer is updating - + __sync_synchronize(); *localSharedData = gSharedData; __sync_synchronize(); @@ -1752,7 +1751,7 @@ int static inline GetFloorWPLength(int accessLen){ int static inline GetFloorWPLengthAtAddress(void * address, int accessLen){ uint8_t alignment = ((size_t) address) & (MAX_WP_LENGTH -1); - + switch (alignment) { case 1: case 3: case 5: case 7: /* 1-byte aligned */ return 1; case 2: case 6: /* 2-byte aligned */ return MIN(2, accessLen); @@ -1877,7 +1876,7 @@ void ReadIPCSharedDataTransactionally(IPC_FSInfo *ipcFSInfo){ int64_t startCounter = ipcSharedData->counter; if(startCounter & 1) continue; // Some writer is updating - + __sync_synchronize(); *ipcFSInfo = ipcSharedData->fsInfo; __sync_synchronize(); @@ -1911,11 +1910,11 @@ static inline void create_shared_memory() { if(__sync_bool_compare_and_swap(&ipcSharedData, 0, ptr)){ hpcrun_process_aux_cleanup_add(destroy_shared_memory, NULL); } - + } uint16_t GetOrCreateIPCSharedLMEntry(const char * realPath){ - + if(ipcSharedData == NULL) create_shared_memory(); // start from 1; leave 0 out; @@ -1957,7 +1956,7 @@ unsigned long GetPFN(unsigned long virt_addr){ printf("Error! 
Cannot open %s\n", PA_PATH); goto ErrExit; } - + //Shifting by virt-addr-offset number of bytes //and multiplying by the size of an address (the size of an entry in pagemap file) uint64_t file_offset = virt_addr / getpagesize() * PAGEMAP_ENTRY; @@ -1980,9 +1979,9 @@ unsigned long GetPFN(unsigned long virt_addr){ else c_buf[PAGEMAP_ENTRY - i - 1] = c; } - + fclose(f); - + for(int i=0; i < PAGEMAP_ENTRY; i++){ //printf("%d ",c_buf[i]); read_val = (read_val << 8) + c_buf[i]; @@ -1995,14 +1994,14 @@ unsigned long GetPFN(unsigned long virt_addr){ // printf("Page not present\n"); // if(GET_BIT(read_val, 62)) // printf("Page swapped\n"); - + return INVALID_PHYSICAL_ADDRESS; ErrExit: if(f){ fclose(f); } return INVALID_PHYSICAL_ADDRESS; - + } @@ -2020,7 +2019,7 @@ static inline struct VAPAMap* splayPAtoVAMap(struct VAPAMap* root, unsigned long static void InsertVAtoPAMap(void * va, unsigned long pa){ VAPAMap_t * found = splayVAtoPAMap(vaToPAMap, va); - + // Check if a trace node with traceKey already exists under this context node if(found && (va == found->virtualAddress)) { vaToPAMap = found; @@ -2049,7 +2048,7 @@ static void InsertVAtoPAMap(void * va, unsigned long pa){ static void InsertPAtoVAMap(unsigned long pa, void * va){ VAPAMap_t * found = splayPAtoVAMap(paToVAMap, pa); - + // Check if a trace node with traceKey already exists under this context node if(found && (pa == found->physicalAddress)) { paToVAMap = found; @@ -2142,7 +2141,7 @@ static void UpdateVMMap(){ if(s != 0){ fprintf(stderr, "\n Failed to STAT %s", VA_PATH); } - + if( ((lastVMMAPCheck % VM_MAP_CHECK_FREQUENCY) == 0) && (lastMapChangeTime != mapsStat.st_mtime)) { // New mapping @@ -2165,11 +2164,11 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, unsigned long pa = GetPAfromVA(data_addr); // Ok, on a shared page! // Ok to publish new data? 
- + // Is the published address old enough (stayed for > 1 sample time span) int64_t curTime = rdtsc(); volatile IPC_FSInfo * globalIPCInfo = &(ipcSharedData->fsInfo); - + pid_t me = myTid; // Get the time, tid, and counter // This is definately racy but benign. @@ -2180,14 +2179,14 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, && (pa != INVALID_PHYSICAL_ADDRESS) // my PA is a valid address ) { // Attempt to lockout - + if(__sync_bool_compare_and_swap(&(ipcSharedData->counter), theCounter, theCounter+1)){ } else { // Failed to update ==> someone else succeeded ==> Fetch that address and set a WP for that goto SET_FS_WP; } - - + + globalIPCInfo->time = rdtsc(); globalIPCInfo->tid = myTid; globalIPCInfo->wpType = accessType == LOAD ? WP_WRITE : WP_RW; @@ -2195,7 +2194,7 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, globalIPCInfo->address = pa; globalIPCInfo->offset = PAGE_OFFSET(data_addr); globalIPCInfo->accessLen = accessLen; - + int btLen = 0; for(; btLen < MAX_BACKTRACE_LEN - 1; btLen++){ if (node == NULL) @@ -2203,7 +2202,7 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, globalIPCInfo->backtrace[btLen] = *hpcrun_cct_addr(node); node = hpcrun_cct_parent(node); } - + // unlikely; if btLen == MAX_BACKTRACE_LEN; drop the WP by invalidating it if (btLen == MAX_BACKTRACE_LEN -1 ) { globalIPCInfo->tid = -1; @@ -2223,12 +2222,12 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, if(va == INVALID_VIRUAL_ADDRESS) { goto ErrExit; } - + va = va + localIPCInfo.offset; - + long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + switch (theWPConfig->id) { case WP_IPC_TRUE_SHARING:{ // Set WP at the same address @@ -2296,7 +2295,7 @@ bool PrintStats(){ void * contextIP = hpcrun_context_pc(context); int v1 = get_access_type(mmap_data->ip); int v2 = get_access_type(contextIP); - + switch(v1){ 
case 0: unk1++; break; case 1: ld1++; break; @@ -2311,7 +2310,7 @@ bool PrintStats(){ case 3: mix2++; break; default: break; } - + float tot = unk1 + ld1 + st1 + mix1; fprintf(stderr, "W=%f (%f), L=%f(%f), M=%f(%f), U=%f(%f)\n", st1/tot, st2/tot, ld1/tot, ld2/tot, mix1/tot, mix2/tot, unk1/tot, unk2/tot); /* @@ -2325,7 +2324,7 @@ bool PrintStats(){ void * contextIP = hpcrun_context_pc(context); extern int is_same_function(void *ins1, void* ins2); int samev1 = is_same_function(contextIP, mmap_data->ip); - + switch(samev1){ case 0: difffunc++; break; case 1: samefunc++; break; @@ -2348,12 +2347,12 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, if (!IsValidAddress(data_addr, precisePC)) { goto ErrExit; // incorrect access type } - + // do not monitor NULL CCT node if (node == NULL) { goto ErrExit; // incorrect CCT } - + // fprintf(stderr, " numWatchpointsSet=%lu\n", wpStats.numWatchpointsSet); int accessLen; @@ -2367,7 +2366,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, //EMSG("Sampled sd.accessType = %d, accessLen=%d at precisePC = %p\n", accessType, accessLen, precisePC); goto ErrExit; // incorrect access type } - + // if the context PC and precise PC are not in the same function, then the sample point is inaccurate. 
bool isSamplePointAccurate; FunctionType ft = is_same_function(contextPC, precisePC); @@ -2376,14 +2375,14 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } else { isSamplePointAccurate = false; } - + switch (theWPConfig->id) { case WP_DEADSPY:{ if(accessType == LOAD){ //EMSG("Sampled accessType = %d\n", accessType); goto ErrExit; // incorrect access type } - + long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; writtenBytes += accessLen * metricThreshold; SampleData_t sd= { @@ -2402,7 +2401,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; - + case WP_REDSPY:{ // If we got an insane address that cannot be read, return silently if(!IsAddressReadable(data_addr)){ @@ -2508,7 +2507,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } else { // 50% chance to detect the temporal reuse sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; //data_addr; //jqswang - sd.reuseType = REUSE_TEMPORAL; + sd.reuseType = REUSE_TEMPORAL; } if (!IsValidAddress(sd.va, precisePC)) { goto ErrExit; // incorrect access type @@ -2554,7 +2553,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, case WP_FALSE_SHARING: case WP_TRUE_SHARING: case WP_ALL_SHARING:{ - + // Is the published address old enough (stayed for > 1 sample time span) int64_t curTime = rdtsc(); SharedData_t localSharedData; @@ -2564,7 +2563,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, localSharedData.time = gSharedData.time; localSharedData.tid = gSharedData.tid; localSharedData.counter = gSharedData.counter; - + //ReadSharedDataTransactionally(&localSharedData); if( ((curTime-localSharedData.time) > 2 * (curTime-lastTime)) // Sufficient time passed since the last time somebody published && @@ -2580,7 +2579,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * 
contextPC, cct_node_t *node, localSharedData.accessLen = accessLen; localSharedData.counter ++; // makes the counter odd localSharedData.node = node; - + if(__sync_bool_compare_and_swap(&gSharedData.counter, theCounter, theCounter+1)){ gSharedData = localSharedData; __sync_synchronize(); @@ -2594,7 +2593,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, SET_FS_WP: ReadSharedDataTransactionally(&localSharedData); long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + switch (theWPConfig->id) { case WP_TRUE_SHARING:{ // Set WP at the same address @@ -2679,7 +2678,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, lastTime = curTime; } break; - + case WP_IPC_FALSE_SHARING: case WP_IPC_TRUE_SHARING: { UpdateVMMap(); @@ -2691,10 +2690,9 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } wpStats.numWatchpointsSet ++; return true; - + ErrExit: wpStats.numImpreciseSamples ++; return false; - -} +} From b59b35300ce562f90370e824b5b7fac9d3a4ab09 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 16 Mar 2018 10:56:31 -0400 Subject: [PATCH 25/43] We will count the number of LOAD_ABOVE_LATENCY even if it is a cache hit --- src/tool/hpcrun/sample-sources/perf/linux_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 00e97406cc..7556bdb7e8 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -471,7 +471,7 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, perf_mmap_data_src_t data_src; data_src.val = mmap_data->data_src; if ( (data_src.mem_lvl & PERF_MEM_LVL_HIT) && (data_src.mem_lvl & PERF_MEM_LVL_L1)){ // L1 HIT, ignore - *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t){.i=0}, 
0/*skipInner*/, 0/*isSync*/, NULL); + *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t){.r=counter}, 0/*skipInner*/, 0/*isSync*/, NULL); } else { *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t) {.r=counter}, 0/*skipInner*/, 0/*isSync*/, &info); From 2e4acee38321507711e77655ba8360b74eebeffd Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 23 Mar 2018 11:01:06 -0400 Subject: [PATCH 26/43] 1). set the monitor length from 4 to 1 2). add online attribution for memory reuse distance 3). fix a bug about sampling policy setting --- .../hpcrun/sample-sources/perf/linux_perf.c | 2 - .../sample-sources/watchpoint_clients.c | 213 ++++++++---------- .../sample-sources/watchpoint_support.c | 4 +- .../sample-sources/watchpoint_support.h | 2 +- 4 files changed, 95 insertions(+), 126 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 7556bdb7e8..61b9c838d0 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -219,7 +219,6 @@ static const char *event_name = "CPU_CYCLES"; int *linux_perf_reading_events = NULL; int linux_perf_num_reading_events = -1; -int reuse_cacheline_distance_event_index = -1; int linux_perf_sample_source_index = -1; //****************************************************************************** @@ -859,7 +858,6 @@ METHOD_FN(process_event_list, int lush_metrics) /******** For witch client WP_REUSE ***************/ if (threshold == 0){ linux_perf_reading_events[linux_perf_num_reading_events++] = i; - reuse_cacheline_distance_event_index = i; linux_perf_sample_source_index = self->sel_idx; } /**************************************************/ diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 9f4f7481b0..287a675e4e 100755 --- 
a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -146,8 +146,12 @@ int latency_metric_id = -1; int temporal_reuse_metric_id = -1; int spatial_reuse_metric_id = -1; int reuse_time_distance_metric_id = -1; // use rdtsc() to represent the reuse distance -int reuse_cacheline_distance_metric_id = -1; // use cache miss to reprent the reuse distance -int reuse_trapped_metric_id = -1; // the times of a watch point trapped +int reuse_time_distance_count_metric_id = -1; // how many times reuse_time_distance_metric is incremented +int reuse_memory_distance_metric_id = -1; // use Loads+stores to reprent the reuse distance +int reuse_memory_distance_count_metric_id = -1; // how many times reuse_memory_distance_metric is incremented +int reuse_buffer_metric_ids[2] = {-1, -1}; // used to store temporal data for reuse client +int reuse_store_buffer_metric_id = -1; // store the last time we get an available value of stores + int false_ww_metric_id = -1; int false_rw_metric_id = -1; @@ -157,7 +161,6 @@ int true_rw_metric_id = -1; int true_wr_metric_id = -1; -extern int reuse_cacheline_distance_event_index; extern int linux_perf_sample_source_index; extern int *linux_perf_reading_events; extern int linux_perf_num_reading_events; @@ -911,23 +914,28 @@ METHOD_FN(process_event_list, int lush_metrics) case WP_REUSE: { - //set up the trace output - //char file_name[PATH_MAX]; - //int ret = snprintf(file_name, PATH_MAX, "%s-%d.reuse.hpcrun", hpcrun_get_executable_name(), TD_GET(core_profile_trace_data.id)); - //int fd = open(str, O_WRONLY | O_CREAT | O_EXCL, 0644); - //assert(fd > 0); - //hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(1<<10), 1<<10, HPCIO_OUTBUF_UNLOCKED); temporal_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + #if 0 spatial_reuse_metric_id = hpcrun_new_metric(); 
hpcrun_set_metric_info_and_period(spatial_reuse_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + #endif + reuse_memory_distance_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_memory_distance_metric_id, "MEMORY_DISTANCE_SUM", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_memory_distance_count_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_memory_distance_count_metric_id, "MEMORY_DISTANCE_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_time_distance_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); - reuse_cacheline_distance_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(reuse_cacheline_distance_metric_id, "CACHELIN_DISTANCE", MetricFlags_ValFmt_Int, 1, metric_property_none); - reuse_trapped_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(reuse_trapped_metric_id, "REUSE_TRAP_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); + hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE_SUM", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_time_distance_count_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_time_distance_count_metric_id, "TIME_DISTANCE_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); + + // the next two buffers only for internal use + reuse_buffer_metric_ids[0] = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[0], "REUSE BUFFER 1", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_buffer_metric_ids[1] = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[1],"REUSE BUFFER 2", MetricFlags_ValFmt_Int, 1, metric_property_none); + } break; case WP_ALL_SHARING: @@ -1171,6 +1179,30 @@ static inline void UpdateConcatenatedPathPair(void *ctxt, cct_node_t * 
oldNode, cct_metric_data_increment(metricId, node, (cct_metric_data_t){.i = metricInc}); } + +//possible return type: (uint64_t *) for interger, (double) for real +static void *get_metric_data_ptr(int metric_id, cct_node_t *node){ + if (! hpcrun_has_metric_set(node)) { + cct2metrics_assoc(node, hpcrun_metric_set_new()); + } + metric_set_t* set = hpcrun_get_metric_set(node); + metric_desc_t* minfo = hpcrun_id2metric(metric_id); + if (!minfo) { + return NULL; + } + hpcrun_metricVal_t* loc = hpcrun_metric_set_loc(set, metric_id); + switch (minfo->flags.fields.valFmt) { + case MetricFlags_ValFmt_Int: + return (void *) &(loc->i); + case MetricFlags_ValFmt_Real: + return (void *) &(loc->r); + default: + assert(false); + } + return NULL; +} + + static inline cct_node_t *getPreciseNode(void *ctxt, void *precise_pc, int dummyMetricId){ // currently, we assume precise_pc + 1 = context_pc for PEBS (+1 means one instruction) // we want the context to point to the exact IP @@ -1199,17 +1231,13 @@ static inline cct_node_t *getPreciseNode(void *ctxt, void *precise_pc, int dummy return new_node; } -static inline void UpdateConcatenatedPathPairMultiple(cct_node_t *bottomNode, cct_node_t * topNode, const void * joinNode, int *metricIdArray, uint64_t *metricIncArray, uint32_t numMetric){ - if (numMetric == 0) return; +static inline cct_node_t *getConcatenatedNode(cct_node_t *bottomNode, cct_node_t * topNode, const void * joinNode){ // insert a special node cct_node_t *node = hpcrun_insert_special_node(topNode, joinNode); // concatenate call paths node = hpcrun_cct_insert_path_return_leaf(bottomNode, node); - for(uint32_t i = 0; i < numMetric; i++){ - // update the foundMetric - cct_metric_data_increment(metricIdArray[i], node, (cct_metric_data_t){.i = metricIncArray[i]}); - } + return node; } @@ -1425,84 +1453,51 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; uint64_t time_distance = rdtsc() - wpi->startTime; - uint64_t cacheline_distance = 0; -// fprintf(stderr, "SECOND_COUNTER:"); - uint64_t val[2][3]; - //uint64_t scaled; - //for (int i=0; i < linux_perf_num_reading_events; i++){ + uint64_t val[2][3]; for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ linux_perf_read_event_counter( linux_perf_reading_events[i], val[i]); - //scaled = perf_scale(val); -// fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); - //cacheline_distance = perf_get_scaled_counter_delta(val, wpi->sample.cachelineReuseDistance); - //cacheline_distance = val[1] - wpi->sample.cachelineReuseDistance[1]; -#if 0 - if (val[1] - wpi->sample.cachelineReuseDistance[1] == val[2] - wpi->sample.cachelineReuseDistance[2]){ - cacheline_distance = val[0] - wpi->sample.cachelineReuseDistance[0]; - } - else{ - cacheline_distance = 0; - } -#endif - } -// fprintf(stderr, "\n"); - -#if 0 - if (cacheline_distance < wpi->sample.cachelineReuseDistance){ - fprintf(stderr, "HPCRUN: cacheline counter value decreased, previous %lu --> current %lu\n", wpi->sample.cachelineReuseDistance, cacheline_distance); - cacheline_distance = 0; // maybe set it to zero ?? 
- } - else { - cacheline_distance -= wpi->sample.cachelineReuseDistance; - } -#endif -#if 0 - if (cacheline_distance == 0){ - fprintf(stderr, "REUSE_DISTANCE (EST): %lu (rate %lf)\n", (uint64_t)( (val[1] - wpi->sample.cachelineReuseDistance[1]) * counting_rate), counting_rate); - //just drop it - } - else{ - fprintf(stderr, "REUSE_DISTANCE (ACC): %lu (rate %lf)\n", cacheline_distance, ((double)val[0] - wpi->sample.cachelineReuseDistance[0] )/((double)val[2] - wpi->sample.cachelineReuseDistance[2])); + for(int j=0; j < 3; j++){ + val[i][j] -= wpi->sample.reuseDistance[i][j]; + } } -#endif - - -// fprintf(stderr, "REUSE_DISTANCE: %c %lu %lu %lu\n", marker, cacheline_distance, inc, val[1] - wpi->sample.cachelineReuseDistance[1]); - //prepare the metric updating arrays - int metricIdArray[4]; - uint64_t metricIncArray[4]; - - metricIncArray[0]=inc; - metricIdArray[1]=reuse_time_distance_metric_id; metricIncArray[1]=time_distance; - metricIdArray[2]=reuse_cacheline_distance_metric_id; metricIncArray[2]=cacheline_distance; - metricIdArray[3]=reuse_trapped_metric_id; metricIncArray[3]=1; - - cct_node_t *reuseNode; - if (wpi->sample.reuseType == REUSE_TEMPORAL){ - reuseTemporal += inc; - metricIdArray[0] = temporal_reuse_metric_id; - reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); - UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); - } - else { - reuseSpatial += inc; - metricIdArray[0] = spatial_reuse_metric_id; - reuseNode = getPreciseNode(wt->ctxt, wt->pc, spatial_reuse_metric_id ); - UpdateConcatenatedPathPairMultiple(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, metricIdArray, metricIncArray, 4); - } + cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); #ifdef REUSE_HISTO WriteWitchTraceOutput("REUSE_DISTANCE: %d 
%d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); for(int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ - - WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0] - wpi->sample.cachelineReuseDistance[i][0],val[i][1] - wpi->sample.cachelineReuseDistance[i][1],val[i][2] - wpi->sample.cachelineReuseDistance[i][2]); + WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0], val[i][1], val[i][2]); } -// fprintf(stderr, "\n"); WriteWitchTraceOutput("\n"); + +#else + + cct_node_t *reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/); + uint64_t obtained_val[2]; + for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + uint64_t * buffer_ptr = (uint64_t *) get_metric_data_ptr(reuse_buffer_metric_ids[i], reusePairNode); + if (val[i][2] == 0){ + //need to borrow value + obtained_val[i] = *buffer_ptr; + } else { + obtained_val[i] = perf_scale(val[i]); + *buffer_ptr = obtained_val[i]; + } + } + + if ( obtained_val[0] > 0 && obtained_val[1] > 0){ //attribute the value + cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (obtained_val[0] + obtained_val[1]) }); + cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); + } + + reuseTemporal += inc; + cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); + cct_metric_data_increment(reuse_time_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = time_distance}); + cct_metric_data_increment(reuse_time_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); #endif + return ALREADY_DISABLED; } @@ -2474,17 +2469,14 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, .isBackTrace = false, }; #ifdef REUSE_HISTO - sd.wpLength = 4; + sd.wpLength = 1; #else sd.wpLength = GetFloorWPLength(accessLen); 
#endif -#ifdef REUSE_HISTO - if (0) -#else +#if 0 //spatial reuse.. currently we don't need it if (rdtsc() & 1)// 50% chance to detect spatial reuse -#endif { int wpSizes[] = {8, 4, 2, 1}; FalseSharingLocs falseSharingLocs[CACHE_LINE_SZ]; @@ -2511,7 +2503,9 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, sd.va = aligned_pc + (r * accessLen); #endif } - else { // 50% chance to detect the temporal reuse + else +#endif + { #ifdef REUSE_HISTO sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; #else @@ -2524,37 +2518,14 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } //make sure the following variables have been set assert(linux_perf_sample_source_index >= 0); - assert(reuse_cacheline_distance_event_index >= 0); - - // Read the cacheline event counter - uint64_t cachelineCount; - //event_thread_t *event_thread = (event_thread_t *)TD_GET(ss_info)[linux_perf_sample_source_index].ptr; - //sd.cachelineReuseDistance = 0; -#if 0 - fprintf(stderr, "FIRST_COUNTER: 0x%lx,%lu", precisePC, rdtsc()); - for (int i=0; i < linux_perf_num_reading_events; i++){ - //read_event_counter(&(event_thread[ linux_perf_reading_events[i]]), &cachelineCount); - linux_perf_read_event_counter(linux_perf_reading_events[i], &cachelineCount, 0); - sd.cachelineReuseDistance += cachelineCount; - fprintf(stderr, " %lu", cachelineCount); - } - fprintf(stderr,"\n"); -#else - //fprintf(stderr, "FIRST_COUNTER(rate %lf):", counting_rate); - //for (int i=0; i < linux_perf_num_reading_events; i++){ - for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ - uint64_t val[3]; - //uint64_t scaled; - linux_perf_read_event_counter( linux_perf_reading_events[i], val); - //update_counting_rate(val); - //scaled = perf_scale(val); - //fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); - memcpy(sd.cachelineReuseDistance[i], val, sizeof(uint64_t)*3);; - } - //fprintf(stderr, "\n"); - //sd.cachelineReuseDistance = 
cachelineCount; -#endif + // Read the reuse distance event counters + // We assume the reading event is load, store or both. + for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + uint64_t val[3]; + linux_perf_read_event_counter( linux_perf_reading_events[i], val); + memcpy(sd.reuseDistance[i], val, sizeof(uint64_t)*3);; + } // register the watchpoint SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index a94ee469e6..5ecb032095 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -281,9 +281,9 @@ static void InitConfig(){ if(replacementScheme){ if(0 == strcasecmp(replacementScheme, "AUTO")) { wpConfig.replacementPolicy = AUTO; - } if (0 == strcasecmp(replacementScheme, "OLDEST")) { + } else if (0 == strcasecmp(replacementScheme, "OLDEST")) { wpConfig.replacementPolicy = OLDEST; - } if (0 == strcasecmp(replacementScheme, "NEWEST")) { + } else if (0 == strcasecmp(replacementScheme, "NEWEST")) { wpConfig.replacementPolicy = NEWEST; } else { // default; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index 20ad878edc..9ab1135a65 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -103,7 +103,7 @@ typedef struct SampleData{ bool isSamplePointAccurate; bool isBackTrace; ReuseType reuseType; - uint64_t cachelineReuseDistance[2][3]; + uint64_t reuseDistance[2][3]; } SampleData_t; typedef struct WatchPointInfo{ From 778f05c429c062a713418433fdb8111cb214eea2 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 28 Mar 2018 16:42:26 -0400 Subject: [PATCH 27/43] NOT a workable version. (But a compilable version) 1). Added cache miss metric from MEM_TRANS_RETIRED:THRESHOLD_ABOVE_LATENCY 2). 
Added online reuse distance estimation due to counter multiplexing 3). Disable the dummy counter from watchpoint 4). Start to implement that the number of memory accesses is read from overflow counters --- .../hpcrun/sample-sources/perf/linux_perf.c | 54 ++++++------------- .../hpcrun/sample-sources/perf/perf-util.c | 6 +-- .../sample-sources/watchpoint_clients.c | 35 ++++++------ .../sample-sources/watchpoint_support.c | 7 ++- 4 files changed, 42 insertions(+), 60 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 61b9c838d0..c15c30469c 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -217,9 +217,6 @@ static const char *event_name = "CPU_CYCLES"; // global variables //****************************************************************************** -int *linux_perf_reading_events = NULL; -int linux_perf_num_reading_events = -1; -int linux_perf_sample_source_index = -1; //****************************************************************************** // local variables @@ -476,6 +473,8 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t) {.r=counter}, 0/*skipInner*/, 0/*isSync*/, &info); extern int latency_metric_id; cct_metric_data_increment(latency_metric_id, sv->sample_node, (cct_metric_data_t){.i = mmap_data->weight}); + extern int latency_miss_load_metric_id; + cct_metric_data_increment(latency_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); } } else { @@ -502,15 +501,6 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, // ok } hpcrun_clear_handling_sample(td); -#endif -#if 0 //for debugging - fprintf(stderr, "COUNTER:"); - for (int i=0; i < linux_perf_num_reading_events; i++){ - uint64_t tmp_counter; - linux_perf_read_event_counter(linux_perf_reading_events[i], 
&tmp_counter, 1 /* is scaled*/); - fprintf(stderr, " %lu", tmp_counter); - } - fprintf(stderr, "\n"); #endif if(WatchpointClientActive()){ OnSample(mmap_data, @@ -518,28 +508,7 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, sv->sample_node, current->event->metric); } - else { -#if 0 - fprintf(stderr, "COUNTER:"); - for (int i=0; i < linux_perf_num_reading_events; i++){ - uint64_t tmp_counter; - linux_perf_read_event_counter(linux_perf_reading_events[i], &tmp_counter, 1 /* is scaled*/); - fprintf(stderr, " %lu", tmp_counter); - } - fprintf(stderr, "\n"); - - fprintf(stderr, "COUNTER:"); - for (int i=0; i < linux_perf_num_reading_events; i++){ - uint64_t val[3]; - uint64_t scaled; - linux_perf_read_event_counter_full( linux_perf_reading_events[i], val); - scaled = perf_scale(val); - fprintf(stderr," %lu,%lu,%lu,%lu", val[0], val[1], val[2], scaled); - } - fprintf(stderr, "\n"); -#endif - } return sv; } @@ -787,8 +756,6 @@ METHOD_FN(process_event_list, int lush_metrics) size_t size = sizeof(event_info_t) * num_events; event_desc = (event_info_t*) hpcrun_malloc(size); - linux_perf_reading_events = (int *) hpcrun_malloc(sizeof(int) * num_events); - linux_perf_num_reading_events = 0; if (event_desc == NULL) { EMSG("Unable to allocate %d bytes", size); return; @@ -855,13 +822,16 @@ METHOD_FN(process_event_list, int lush_metrics) // set the metric for this perf event event_desc[i].metric = hpcrun_new_metric(); + /******** For witch client WP_REUSE ***************/ - if (threshold == 0){ - linux_perf_reading_events[linux_perf_num_reading_events++] = i; - linux_perf_sample_source_index = self->sel_idx; + if (strstr(name, "MEM_UOPS_RETIRED") != NULL){ + extern int *reuse_distance_events; + extern int reuse_distance_num_events; + reuse_distance_events[reuse_distance_num_events++] = i; } /**************************************************/ + // ------------------------------------------------------------ // if we use frequency (event_type=1) then the 
period is not deterministic, // it can change dynamically. In this case, the period is 1 @@ -879,6 +849,9 @@ METHOD_FN(process_event_list, int lush_metrics) extern int latency_metric_id; latency_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(latency_metric_id, "LATENCY", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_miss_load_metric_id; + latency_miss_load_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_miss_load_metric_id, "CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); } if (m == NULL) { @@ -1037,6 +1010,11 @@ void linux_perf_events_resume(){ int linux_perf_read_event_counter(int event_index, uint64_t *val){ sample_source_t *self = &obj_name(); event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; + + // check whether it is a overflow counter + //jqswang: TODO + + return perf_read_event_counter(&(event_thread[event_index]), val); } diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.c b/src/tool/hpcrun/sample-sources/perf/perf-util.c index 24e8c46a24..dde2874e30 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.c +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.c @@ -317,10 +317,8 @@ perf_attr_init( attr->sample_period = threshold; /* Period or frequency of sampling */ - //jqswang - if (threshold == 0){ - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING; - } + // It enables that we can directly read the value of the event counter via file descriptor + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING; int max_sample_rate = perf_max_sample_rate(); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 287a675e4e..3efdb5a2ad 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -142,6 +142,7 @@ int load_metric_id = -1; 
int dead_metric_id = -1; int measured_metric_id = -1; int latency_metric_id = -1; +int latency_miss_load_metric_id = -1; int temporal_reuse_metric_id = -1; int spatial_reuse_metric_id = -1; @@ -152,7 +153,6 @@ int reuse_memory_distance_count_metric_id = -1; // how many times reuse_memory_d int reuse_buffer_metric_ids[2] = {-1, -1}; // used to store temporal data for reuse client int reuse_store_buffer_metric_id = -1; // store the last time we get an available value of stores - int false_ww_metric_id = -1; int false_rw_metric_id = -1; int false_wr_metric_id = -1; @@ -161,9 +161,9 @@ int true_rw_metric_id = -1; int true_wr_metric_id = -1; -extern int linux_perf_sample_source_index; -extern int *linux_perf_reading_events; -extern int linux_perf_num_reading_events; +int reuse_distance_events[2] = {-1, -1}; +int reuse_distance_num_events = 0; + static inline uint64_t perf_scale(uint64_t *values) { //jqswang uint64_t res = 0; @@ -615,8 +615,8 @@ static void ClientTermination(){ //fprintf(stderr, "FINAL_COUNTING:"); WriteWitchTraceOutput("FINAL_COUNTING:"); - for (int i=0; i < MIN(2,linux_perf_num_reading_events); i++){ - linux_perf_read_event_counter( linux_perf_reading_events[i], val); + for (int i=0; i < MIN(2,reuse_distance_num_events); i++){ + linux_perf_read_event_counter(reuse_distance_events[i], val); //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang WriteWitchTraceOutput(" %lu %lu %lu,", val[0], val[1], val[2]); } @@ -1440,12 +1440,13 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ +#if 0 // jqswang:TODO, how to handle it? 
if(!wt->pc) { // if the ip is 0, let's drop the WP //return RETAIN_WP; return ALREADY_DISABLED; } - +#endif //jqswang // Report a reuse double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); @@ -1455,18 +1456,19 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t time_distance = rdtsc() - wpi->startTime; uint64_t val[2][3]; - for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ - linux_perf_read_event_counter( linux_perf_reading_events[i], val[i]); - for(int j=0; j < 3; j++){ + for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ + linux_perf_read_event_counter( reuse_distance_events[i], val[i]); + //fprintf(stderr, "VAL %lu %lu %lu\n", val[i][0], val[i][1], val[i][2]); + for(int j=0; j < 3; j++){ val[i][j] -= wpi->sample.reuseDistance[i][j]; - } + } } cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); #ifdef REUSE_HISTO WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); - for(int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + for(int i=0; i < MIN(2, reuse_distance_num_events); i++){ WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0], val[i][1], val[i][2]); } WriteWitchTraceOutput("\n"); @@ -2507,7 +2509,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, #endif { #ifdef REUSE_HISTO - sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; + sd.va = data_addr;//sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; #else sd.va = data_addr; #endif @@ -2516,14 +2518,13 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, if (!IsValidAddress(sd.va, precisePC)) { goto ErrExit; // incorrect access type } - //make sure the following variables have been set - 
assert(linux_perf_sample_source_index >= 0); // Read the reuse distance event counters // We assume the reading event is load, store or both. - for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ uint64_t val[3]; - linux_perf_read_event_counter( linux_perf_reading_events[i], val); + linux_perf_read_event_counter( reuse_distance_events[i], val); + //fprintf(stderr, "USE %lu %lu %lu\n", val[0], val[1], val[2]); memcpy(sd.reuseDistance[i], val, sizeof(uint64_t)*3);; } diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index 5ecb032095..723aa9e230 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -206,6 +206,9 @@ static void InitConfig(){ } else { wpConfig.isLBREnabled = false; } + // jqswang:TODO HACK HACK March/27 debugiing with Qingsen +// FIXME: + wpConfig.isLBREnabled = false; CHECK(close(fd)); @@ -454,7 +457,9 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, wpi->fileHandle = perf_fd; // mmap the file if lbr is enabled - if(wpConfig.isLBREnabled) { + //if(wpConfig.isLBREnabled) { + //jqswang: TODO + if(1 /* FIXME: HACK HACK March/27 debugging with qingsen*/) { wpi->mmapBuffer = MAPWPMBuffer(perf_fd); } } From 526df621440bf34913bd328aa5558cb94ee7cd7f Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Thu, 29 Mar 2018 14:40:44 -0400 Subject: [PATCH 28/43] NOT A WORKABLE VERSION (but a compilable version) Implemented the function linux_perf_read_event_counter() --- .../hpcrun/sample-sources/perf/linux_perf.c | 27 ++++++++++++++++--- .../hpcrun/sample-sources/perf/perf-util.h | 2 +- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index c15c30469c..1b4899b076 100644 --- 
a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -311,6 +311,7 @@ perf_init() static bool perf_thread_init(event_info_t *event, event_thread_t *et) { + et->num_overflows = 0; et->event = event; // ask sys to "create" the event // it returns -1 if it fails. @@ -1006,16 +1007,30 @@ void linux_perf_events_resume(){ perf_start_all(nevents, event_thread); } -// val is a uint64_t array and has at least 3 elements +// OUTPUT: val, it is a uint64_t array and has at least 3 elements. +// For a counting event, val[0] is the actual value read from counter; val[1] is the time enabling; val[2] is time running +// For a overflow event, val[0] is the actual scaled value; val[1] and val[2] are set to 0 +// RETURN: 0, sucess; -1, error int linux_perf_read_event_counter(int event_index, uint64_t *val){ sample_source_t *self = &obj_name(); event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; - // check whether it is a overflow counter - //jqswang: TODO + event_thread_t *current = &(event_thread[event_index]); + int ret = perf_read_event_counter(current, val); + if (ret < 0) return -1; // something wrong here - return perf_read_event_counter(&(event_thread[event_index]), val); + uint64_t sample_period = current->event->attr.sample_period; + if (sample_period == 0){ // counting event + return 0; + } else { + // overflow event + uint64_t scaled_val = perf_scale(val); + val[0] = current->num_overflows * sample_period + (sample_period - scaled_val); + val[1] = 0; + val[2] = 0; + return 0; + } } @@ -1115,6 +1130,10 @@ perf_event_handler( return 1; // tell monitor the signal has not been handled. 
} + // Increment the number of overflows for the current event + current->num_overflows++; + + // ---------------------------------------------------------------------------- // parse the buffer until it finishes reading all buffers // ---------------------------------------------------------------------------- diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.h b/src/tool/hpcrun/sample-sources/perf/perf-util.h index 38db32e702..c2fa45f2a8 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.h +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.h @@ -166,7 +166,7 @@ typedef struct event_thread_s { pe_mmap_t *mmap; // mmap buffer int fd; // file descriptor of the event event_info_t *event; // pointer to main event description - + uint64_t num_overflows; // record how many times this event has overflowed } event_thread_t; From c2c01cb254b14ae6b1607e5b8db69b496a003df7 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 30 Mar 2018 16:03:21 -0400 Subject: [PATCH 29/43] Workable version. To collect reuse histo, we can just read sampling event now. 
--- .../hpcrun/sample-sources/perf/linux_perf.c | 66 +++++++++++++++++-- .../sample-sources/watchpoint_clients.c | 47 ++----------- .../sample-sources/watchpoint_support.c | 9 +-- .../sample-sources/watchpoint_support.h | 2 +- 4 files changed, 70 insertions(+), 54 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 1b4899b076..fbf1ae47fa 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -763,6 +763,16 @@ METHOD_FN(process_event_list, int lush_metrics) } memset(event_desc, 0, size); + extern int *reuse_distance_events; + extern int reuse_distance_num_events; + reuse_distance_events = (int *) hpcrun_malloc(sizeof(int) * num_events); + reuse_distance_num_events = 0; + if (reuse_distance_events == NULL){ + EMSG("Unable to allocate %d bytes", sizeof(int)*num_events); + return; + } + + int i=0; default_threshold = init_default_count(); @@ -826,8 +836,6 @@ METHOD_FN(process_event_list, int lush_metrics) /******** For witch client WP_REUSE ***************/ if (strstr(name, "MEM_UOPS_RETIRED") != NULL){ - extern int *reuse_distance_events; - extern int reuse_distance_num_events; reuse_distance_events[reuse_distance_num_events++] = i; } /**************************************************/ @@ -971,16 +979,40 @@ restart_perf_event(int fd) TMSG(LINUX_PERF, "Unable to start event: fd is not valid"); return -1; } - +#if 0 //jqswang + uint64_t val[3]; + read(fd, val, sizeof(uint64_t)*3); + //fprintf(stderr, "Before RESET %lx %lx %lx\n", val[0], val[1], val[2]); + fprintf(stderr, "Before RESET1 %lx\n", val[0]); + + for(volatile int i=0; i< 1000; i++); + read(fd, val, sizeof(uint64_t)*3); + fprintf(stderr, "Before RESET2 %lx\n", val[0]); +#endif int ret = ioctl(fd, PERF_EVENT_IOC_RESET, 0); if (ret == -1) { TMSG(LINUX_PERF, "error fd %d in PERF_EVENT_IOC_RESET: %s", fd, strerror(errno)); } +#if 0 //jqsang + read(fd, val, 
sizeof(uint64_t)*3); + //fprintf(stderr, "AFTER RESET %lx %lx %lx\n", val[0], val[1], val[2]); + fprintf(stderr, "AFTER RESET %lx\n", val[0]); +#endif + ret = ioctl(fd, PERF_EVENT_IOC_REFRESH, 1); if (ret == -1) { TMSG(LINUX_PERF, "error fd %d in IOC_REFRESH: %s", fd, strerror(errno)); } + //jqswang +#if 0 + for(volatile int i=0; i< 1000; i++); + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + uint64_t val[3]; + ret = read(fd, val, sizeof(uint64_t) * 3 ); + fprintf(stderr, "After DISABLE %lx %lx %lx\n", val[0], val[1], val[2]); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); +#endif return ret; } /*************************************************************************** @@ -1025,8 +1057,16 @@ int linux_perf_read_event_counter(int event_index, uint64_t *val){ return 0; } else { // overflow event - uint64_t scaled_val = perf_scale(val); - val[0] = current->num_overflows * sample_period + (sample_period - scaled_val); + //fprintf(stderr, "DEBUG: %lu, %lu %lu %lu\n", sample_period, val[0],val[1],val[2]); + //uint64_t scaled_val = perf_scale(val); + assert(val[1] == val[2]); //jqswang: TODO: I have no idea how to calculate the value under multiplexing for overflow event. 
+ int64_t scaled_val = (int64_t) val[0] ;//% sample_period; + if (scaled_val >= sample_period || scaled_val < 0){ //jqswang: TODO: it does not filter out all the invalid values + scaled_val = 0; + } + //fprintf(stderr, "%s: %lu, %lu %lu %lu ->", current->event->metric_desc->name, current->num_overflows, val[0],val[1],val[2]); + val[0] = current->num_overflows * sample_period + scaled_val; + //fprintf(stderr, " %lu\n", val[0]); val[1] = 0; val[2] = 0; return 0; @@ -1154,10 +1194,22 @@ perf_event_handler( record_sample(current, &mmap_data, context, &sv); kernel_block_handler(current, sv, &mmap_data); - - } while (more_data); + } while (more_data); hpcrun_safe_exit(); +#if 0//jqswang + uint64_t val[3]; + read(fd, val, sizeof(uint64_t)*3); + //fprintf(stderr, "Before RESET %lx %lx %lx\n", val[0], val[1], val[2]); + //fprintf(stderr, "Before RESTART %s %lx\n", current->event->metric_desc->name,val[0]); + extern int reuse_distance_num_events; + extern int *reuse_distance_events; + for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ + //linux_perf_read_event_counter( reuse_distance_events[i], val); + //fprintf(stderr, "READING %lx [%lu] ---", val[0], ); + } + //fprintf(stderr,"\n"); +#endif restart_perf_event(fd); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 3efdb5a2ad..bb9ef1655c 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -161,39 +161,8 @@ int true_rw_metric_id = -1; int true_wr_metric_id = -1; -int reuse_distance_events[2] = {-1, -1}; +int *reuse_distance_events = NULL; int reuse_distance_num_events = 0; - - -static inline uint64_t perf_scale(uint64_t *values) { //jqswang - uint64_t res = 0; - - if (!values[2] && !values[1] && values[0]) { - fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); - } - if (values[2] > values[1]) { - fprintf(stderr, "WARNING: time_running > 
time_enabled\n"); - } - if (values[2]) { - res = (uint64_t)((double)values[0] * values[1]/values[2]); - } - return res; -} - -uint64_t old_values[3] = {0,0,0}; -double counting_rate = 0.0; -static inline void update_counting_rate(uint64_t *values){ //jqswang - if ( values[2] == old_values[2]){ - //fprintf(stderr, "HPCRUN: WARNING: the sampling rate is too high for the multiplexed events\n"); - return; - } - counting_rate = ((double)(values[0] - old_values[0])) / (values[2] - old_values[2]); - memcpy(old_values, values, sizeof(uint64_t)*3); - -} - - - #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; @@ -1458,7 +1427,8 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t val[2][3]; for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ linux_perf_read_event_counter( reuse_distance_events[i], val[i]); - //fprintf(stderr, "VAL %lu %lu %lu\n", val[i][0], val[i][1], val[i][2]); + //fprintf(stderr, "USE: %lu %lu %lu, REUSE: %lu %lu %lu\n", wpi->sample.reuseDistance[i][0], wpi->sample.reuseDistance[i][1], wpi->sample.reuseDistance[i][2], val[i][0], val[i][1], val[i][2]); + //fprintf(stderr, "DIFF: %lu\n", val[i][0] - wpi->sample.reuseDistance[i][0]); for(int j=0; j < 3; j++){ val[i][j] -= wpi->sample.reuseDistance[i][j]; } @@ -2508,11 +2478,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, else #endif { -#ifdef REUSE_HISTO - sd.va = data_addr;//sd.va = (void *)(( (uint64_t)data_addr >> 2) << 2) ; -#else sd.va = data_addr; -#endif sd.reuseType = REUSE_TEMPORAL; } if (!IsValidAddress(sd.va, precisePC)) { @@ -2524,10 +2490,11 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ uint64_t val[3]; linux_perf_read_event_counter( reuse_distance_events[i], val); - //fprintf(stderr, "USE %lu %lu %lu\n", val[0], val[1], val[2]); - 
memcpy(sd.reuseDistance[i], val, sizeof(uint64_t)*3);; + //fprintf(stderr, "USE %lu %lu %lu -- ", val[0], val[1], val[2]); + //fprintf(stderr, "USE %lx -- ", val[0]); + memcpy(sd.reuseDistance[i], val, sizeof(uint64_t)*3);; } - + //fprintf(stderr, "\n"); // register the watchpoint SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index 723aa9e230..5aa5285b45 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -206,9 +206,7 @@ static void InitConfig(){ } else { wpConfig.isLBREnabled = false; } - // jqswang:TODO HACK HACK March/27 debugiing with Qingsen -// FIXME: - wpConfig.isLBREnabled = false; + wpConfig.isLBREnabled = false; //jqswang: TODO CHECK(close(fd)); @@ -377,6 +375,7 @@ void ReuseWPConfigOverride(void *v){ // dont fix IP wpConfig.dontFixIP = true; wpConfig.dontDisassembleWPAddress = true; + wpConfig.isLBREnabled = false; //jqswang } static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, bool modify) { @@ -458,10 +457,8 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, wpi->fileHandle = perf_fd; // mmap the file if lbr is enabled //if(wpConfig.isLBREnabled) { - //jqswang: TODO - if(1 /* FIXME: HACK HACK March/27 debugging with qingsen*/) { wpi->mmapBuffer = MAPWPMBuffer(perf_fd); - } + //} } wpi->isActive = true; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index 9ab1135a65..fb6b32d54d 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -83,7 +83,7 @@ typedef enum ReplacementPolicy {AUTO, EMPTY_SLOT_ONLY, OLDEST, NEWEST} Replaceme typedef enum MergePolicy {AUTO_MERGE, NO_MERGE, CLIENT_ACTION} MergePolicy; typedef enum OverwritePolicy {OVERWRITE, 
NO_OVERWRITE} OverwritePolicy; typedef enum VictimType {EMPTY_SLOT, NON_EMPTY_SLOT, NONE_AVAILABLE} VictimType; -typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; //jqswang: what do they mean? +typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL} ReuseType; // for reuse client // Data structure that is given by clients to set a WP From 9892b2ac14554c718f1a4b70f0f45b5f0569e2a6 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 30 Mar 2018 18:19:29 -0400 Subject: [PATCH 30/43] Implemented the reuse guide version --- .../hpcrun/sample-sources/perf/linux_perf.c | 7 ++++++- .../hpcrun/sample-sources/watchpoint_clients.c | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index fbf1ae47fa..29d9d4f29a 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -835,7 +835,12 @@ METHOD_FN(process_event_list, int lush_metrics) /******** For witch client WP_REUSE ***************/ - if (strstr(name, "MEM_UOPS_RETIRED") != NULL){ +#ifdef REUSE_HISTO + if (strstr(name, "MEM_UOPS_RETIRED") != NULL) +#else + if (strstr(name, "MEM_UOPS_RETIRED") != NULL && threshold == 0) +#endif + { reuse_distance_events[reuse_distance_num_events++] = i; } /**************************************************/ diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index bb9ef1655c..70b784201a 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -168,6 +168,22 @@ int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; int pebs_metric_id[NUM_WATERMARK_METRICS] = 
{-1, -1, -1, -1}; +static inline uint64_t perf_scale(uint64_t *values) { //jqswang + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + fprintf(stderr, "WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + + void SetupWatermarkMetric(int metricId){ if (curWatermarkId == NUM_WATERMARK_METRICS) { EEMSG("curWatermarkId == NUM_WATERMARK_METRICS = %d", NUM_WATERMARK_METRICS); @@ -1447,7 +1463,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse cct_node_t *reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/); uint64_t obtained_val[2]; - for (int i=0; i < MIN(2, linux_perf_num_reading_events); i++){ + for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ uint64_t * buffer_ptr = (uint64_t *) get_metric_data_ptr(reuse_buffer_metric_ids[i], reusePairNode); if (val[i][2] == 0){ //need to borrow value From 8801537bb04e05e4d95dfe71868e7e279d2ceeb1 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 31 Mar 2018 11:21:09 -0400 Subject: [PATCH 31/43] Fixed some bug --- .../sample-sources/watchpoint_clients.c | 54 ++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 70b784201a..c5237985d7 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -163,6 +163,12 @@ int true_wr_metric_id = -1; int *reuse_distance_events = NULL; int reuse_distance_num_events = 0; +#ifdef REUSE_HISTO +#else +AccessType reuse_monitor_type = LOAD_AND_STORE; // WP_REUSE: what kind of memory access can be used to subscribe the watchpoint 
+WatchPointType reuse_trap_type = WP_RW; // WP_REUSE: what kind of memory access can trap the watchpoint +#endif + #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; @@ -899,6 +905,45 @@ METHOD_FN(process_event_list, int lush_metrics) case WP_REUSE: { +#ifdef REUSE_HISTO +#else + { + char * monitor_type_str = getenv("HPCRUN_WP_REUSE_MONITOR_TYPE"); + if(monitor_type_str){ + if(0 == strcasecmp(monitor_type_str, "LOAD")) { + reuse_monitor_type = LOAD; + } else if (0 == strcasecmp(monitor_type_str, "STORE")) { + reuse_monitor_type = STORE; + } else if (0 == strcasecmp(monitor_type_str, "LS") || 0 == strcasecmp(monitor_type_str, "ALL") ) { + reuse_monitor_type = LOAD_AND_STORE; + } else { + // default; + reuse_monitor_type = LOAD_AND_STORE; + } + } else{ + // default + reuse_monitor_type = LOAD_AND_STORE; + } + } + { + char *trap_type_str = getenv("HPCRUN_WP_REUSE_TRAP_TYPE"); + if(trap_type_str){ + if(0 == strcasecmp(trap_type_str, "LOAD")) { + reuse_trap_type = WP_RW; // NO WP_READ allowed + } else if (0 == strcasecmp(trap_type_str, "STORE")) { + reuse_trap_type = WP_WRITE; + } else if (0 == strcasecmp(trap_type_str, "LS") || 0 == strcasecmp(trap_type_str, "ALL") ) { + reuse_trap_type = WP_RW; + } else { + // default; + reuse_trap_type = WP_RW; + } + } else{ + // default + reuse_trap_type = WP_RW; + } + } +#endif temporal_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); @@ -917,9 +962,9 @@ METHOD_FN(process_event_list, int lush_metrics) // the next two buffers only for internal use reuse_buffer_metric_ids[0] = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[0], "REUSE BUFFER 1", MetricFlags_ValFmt_Int, 1, metric_property_none); + hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[0], "REUSE_BUFFER_1", MetricFlags_ValFmt_Int, 1, 
metric_property_none); reuse_buffer_metric_ids[1] = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[1],"REUSE BUFFER 2", MetricFlags_ValFmt_Int, 1, metric_property_none); + hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[1],"REUSE_BUFFER_2", MetricFlags_ValFmt_Int, 1, metric_property_none); } break; @@ -2443,6 +2488,10 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } break; case WP_REUSE:{ +#ifdef REUSE_HISTO +#else + if ( accessType != reuse_monitor_type && reuse_monitor_type != LOAD_AND_STORE) break; +#endif long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; SampleData_t sd= { @@ -2460,6 +2509,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, sd.wpLength = 1; #else sd.wpLength = GetFloorWPLength(accessLen); + sd.type = reuse_trap_type; #endif From 9d5f4d7e286eb69f7c753c6556125477f4ef8e90 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Mon, 16 Apr 2018 10:48:42 -0400 Subject: [PATCH 32/43] Fixed some bug; How to concatenate the use and reuse pair can be determined by the environmental variable HPCRUN_WP_REUSE_CONCATENATE_ORDER --- .../hpcrun/sample-sources/perf/linux_perf.c | 2 +- .../sample-sources/watchpoint_clients.c | 46 +++++++++++++++---- .../sample-sources/watchpoint_support.c | 11 ++++- 3 files changed, 48 insertions(+), 11 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 29d9d4f29a..22a7034b72 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -838,7 +838,7 @@ METHOD_FN(process_event_list, int lush_metrics) #ifdef REUSE_HISTO if (strstr(name, "MEM_UOPS_RETIRED") != NULL) #else - if (strstr(name, "MEM_UOPS_RETIRED") != NULL && threshold == 0) + if (strstr(name, "MEM_UOPS_RETIRED") != NULL) //jqswang: TODO // && threshold == 0) #endif { 
reuse_distance_events[reuse_distance_num_events++] = i; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index c5237985d7..641361c920 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -167,6 +167,7 @@ int reuse_distance_num_events = 0; #else AccessType reuse_monitor_type = LOAD_AND_STORE; // WP_REUSE: what kind of memory access can be used to subscribe the watchpoint WatchPointType reuse_trap_type = WP_RW; // WP_REUSE: what kind of memory access can trap the watchpoint +bool reuse_concatenate_use_reuse = false; // WP_REUSE: how to concatentate the use and reuse #endif #define NUM_WATERMARK_METRICS (4) @@ -943,6 +944,15 @@ METHOD_FN(process_event_list, int lush_metrics) reuse_trap_type = WP_RW; } } + + { + char *concatenate_order_str = getenv("HPCRUN_WP_REUSE_CONCATENATE_ORDER"); + if(concatenate_order_str && 0 == strcasecmp(concatenate_order_str, "USE_REUSE")){ + reuse_concatenate_use_reuse = true; + } else{ + reuse_concatenate_use_reuse = false; + } + } #endif temporal_reuse_metric_id = hpcrun_new_metric(); @@ -1057,8 +1067,9 @@ enum JoinNodeType { E_KILLED=0, E_USED, E_NEW_VAL, - E_TEMPORALLY_REUSED, - E_SPATIALLY_REUSED, + E_TEMPORALLY_REUSED_FROM, + E_TEMPORALLY_REUSED_BY, + E_SPATIALLY_REUSED_FROM, E_TRUE_WW_SHARE, E_TRUE_WR_SHARE, E_TRUE_RW_SHARE, @@ -1091,6 +1102,9 @@ static void NEW_VAL_BY_INACCURATE_PC(void) {} static void TEMPORALLY_REUSED_FROM(void) {} static void TEMPORALLY_REUSED_FROM_INACCURATE_PC(void) {} +static void TEMPORALLY_REUSED_BY(void) {} +static void TEMPORALLY_REUSED_BY_INACCURATE_PC(void) {} + static void SPATIALLY_REUSED_FROM(void) {} static void SPATIALLY_REUSED_FROM_INACCURATE_PC(void) {} @@ -1137,8 +1151,9 @@ static const void * joinNodes[][2] = { [E_KILLED] = GET_FUN_ADDR(KILLED_BY), [E_USED] = GET_FUN_ADDR(USED_BY), [E_NEW_VAL] = GET_FUN_ADDR(NEW_VAL_BY), - [E_TEMPORALLY_REUSED] = 
GET_FUN_ADDR(TEMPORALLY_REUSED_FROM), - [E_SPATIALLY_REUSED] = GET_FUN_ADDR(SPATIALLY_REUSED_FROM), + [E_TEMPORALLY_REUSED_FROM] = GET_FUN_ADDR(TEMPORALLY_REUSED_FROM), + [E_TEMPORALLY_REUSED_BY] = GET_FUN_ADDR(TEMPORALLY_REUSED_BY), + [E_SPATIALLY_REUSED_FROM] = GET_FUN_ADDR(SPATIALLY_REUSED_FROM), [E_TRUE_WW_SHARE] = GET_FUN_ADDR(TRUE_WW_SHARE), [E_TRUE_WR_SHARE] = GET_FUN_ADDR(TRUE_WR_SHARE), [E_TRUE_RW_SHARE] = GET_FUN_ADDR(TRUE_RW_SHARE), @@ -1241,7 +1256,7 @@ static inline cct_node_t *getPreciseNode(void *ctxt, void *precise_pc, int dummy sample_val_t v = hpcrun_sample_callpath(ctxt, dummyMetricId, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); cct_node_t *new_node = v.sample_node; if (precise_pc == 0) return new_node; - + //fprintf("precise_pc = %lx\n", precise_pc); cct_node_t *tmp_node = hpcrun_cct_parent(new_node); assert(tmp_node); if (is_same_function(hpcrun_context_pc(ctxt), precise_pc) == SAME_FN){ @@ -1495,7 +1510,9 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse } } - cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); + //cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *reuseNode = v.sample_node; #ifdef REUSE_HISTO WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); @@ -1506,7 +1523,14 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse #else - cct_node_t *reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED][joinNodeIdx] /* joinNode*/); + cct_node_t *reusePairNode; + if (reuse_concatenate_use_reuse){ + reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, 
joinNodes[E_TEMPORALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); + }else{ + reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + } + +#if 0 //jqswang: currently disable the value borrowing process uint64_t obtained_val[2]; for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ uint64_t * buffer_ptr = (uint64_t *) get_metric_data_ptr(reuse_buffer_metric_ids[i], reusePairNode); @@ -1519,11 +1543,15 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse } } - if ( obtained_val[0] > 0 && obtained_val[1] > 0){ //attribute the value + if ( obtained_val[0] > 0 && obtained_val[1] > 0) //attribute the value + { cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (obtained_val[0] + obtained_val[1]) }); cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); } - +#else + cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (val[0][0] + val[1][0]) }); + cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); +#endif reuseTemporal += inc; cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); cct_metric_data_increment(reuse_time_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = time_distance}); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index 5aa5285b45..913908e08a 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -206,7 +206,16 @@ static void InitConfig(){ } else { wpConfig.isLBREnabled = false; } - wpConfig.isLBREnabled = false; //jqswang: TODO + { //jqswang: Maybe we can move this part to other better location? 
+ char * lbr_flag_str = getenv("HPCRUN_WP_REUSE_LBR"); + if (lbr_flag_str && 0 == strcasecmp(lbr_flag_str, "ENABLE")){ + wpConfig.isLBREnabled = true; + //printf(stderr,"LBR is set to TRUE\n"); + } else { + wpConfig.isLBREnabled = false; + //fprintf(stderr,"LBR is set to FALSE\n"); + } + } CHECK(close(fd)); From 44d793c271441757bf9633ad4b8115e287fedc30 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Mon, 16 Apr 2018 11:35:44 -0400 Subject: [PATCH 33/43] Fixed a bug of overflow --- .../sample-sources/watchpoint_clients.c | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 641361c920..b2df7a6e43 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -1492,24 +1492,30 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse return ALREADY_DISABLED; } #endif //jqswang - // Report a reuse - double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); - uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - uint64_t inc = numDiffSamples; - int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; - uint64_t time_distance = rdtsc() - wpi->startTime; - - uint64_t val[2][3]; + uint64_t val[2][3]; for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ linux_perf_read_event_counter( reuse_distance_events[i], val[i]); //fprintf(stderr, "USE: %lu %lu %lu, REUSE: %lu %lu %lu\n", wpi->sample.reuseDistance[i][0], wpi->sample.reuseDistance[i][1], wpi->sample.reuseDistance[i][2], val[i][0], val[i][1], val[i][2]); //fprintf(stderr, "DIFF: %lu\n", val[i][0] - wpi->sample.reuseDistance[i][0]); for(int j=0; j < 3; j++){ - val[i][j] -= wpi->sample.reuseDistance[i][j]; + if (val[i][j] >= wpi->sample.reuseDistance[i][j]){ + val[i][j] -= wpi->sample.reuseDistance[i][j]; + } + else { //Something wrong happens here and the record is not reliable. Drop it! + return ALREADY_DISABLED; + } } } + // Report a reuse + double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); + uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); + uint64_t inc = numDiffSamples; + int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; + + uint64_t time_distance = rdtsc() - wpi->startTime; + //cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); cct_node_t *reuseNode = v.sample_node; From 0f4821c2606fcf0148fffc48adbf743e9ef54e21 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 18 Apr 2018 15:02:03 -0400 Subject: [PATCH 34/43] Fixed some conflicts --- src/tool/hpcrun/sample-sources/perf/linux_perf.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index e262bad713..1ab9af35c0 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -518,8 +518,6 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } hpcrun_clear_handling_sample(td); #endif -<<<<<<< HEAD -======= // check whether we can get the ra_loc in each frame if (ENABLED(RALOC)) { @@ -533,7 +531,6 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } ->>>>>>> upstream/master if(WatchpointClientActive()){ OnSample(mmap_data, hpcrun_context_pc(context), @@ -773,11 +770,8 @@ METHOD_FN(process_event_list, int lush_metrics) // automatically. But in practice, it didn't. Not sure why. 
for (event = start_tok(evlist); more_tok(); event = next_tok(), num_events++); -<<<<<<< HEAD self->evl.nevents = num_events; -======= ->>>>>>> upstream/master // setup all requested events // if an event cannot be initialized, we still keep it in our list From b154c5d00e1ec7e9afef02d5d50d7af114de1fc4 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Tue, 24 Apr 2018 14:57:45 -0400 Subject: [PATCH 35/43] Added the reservoir sampling patch --- src/tool/hpcrun/sample-sources/watchpoint_support.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index 9a8d3a5342..0017a47799 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -633,6 +633,12 @@ static VictimType GetVictim(int * location, ReplacementPolicy policy){ for(int i = 0; i < wpConfig.maxWP; i++){ if(!tData.watchPointArray[i].isActive) { *location = i; + // Increase samplePostFull for those who survived. + for(int rest = 0; rest < wpConfig.maxWP; rest++){ + if (tData.watchPointArray[rest].isActive) { + tData.watchPointArray[rest].samplePostFull++; + } + } return EMPTY_SLOT; } } @@ -692,6 +698,9 @@ static VictimType GetVictim(int * location, ReplacementPolicy policy){ if(randValue <= probabilityToReplace) { *location = loc; + for(int rest = i+1; rest < wpConfig.maxWP; rest++){ + tData.watchPointArray[slots[rest]].samplePostFull++; + } return NON_EMPTY_SLOT; } // TODO: Milind: Not sure whether I should increment samplePostFull of the remainiing slots. 
From 2c3818cf73cd625d85bb05a70442844fe5be9fdc Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Tue, 8 May 2018 15:49:31 -0400 Subject: [PATCH 36/43] Added online profiling (without tracing) --- .../sample-sources/watchpoint_clients.c | 159 ++++++++++++++++-- .../sample-sources/watchpoint_support.h | 1 + 2 files changed, 143 insertions(+), 17 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index b2df7a6e43..e737755757 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -164,6 +164,12 @@ int true_wr_metric_id = -1; int *reuse_distance_events = NULL; int reuse_distance_num_events = 0; #ifdef REUSE_HISTO +bool reuse_output_trace = false; +double reuse_bin_start = 0; +double reuse_bin_ratio = 0; +uint64_t * reuse_bin_list = NULL; +double * reuse_bin_pivot_list = NULL; // store the bin intervals +int reuse_bin_size = 0; #else AccessType reuse_monitor_type = LOAD_AND_STORE; // WP_REUSE: what kind of memory access can be used to subscribe the watchpoint WatchPointType reuse_trap_type = WP_RW; // WP_REUSE: what kind of memory access can trap the watchpoint @@ -359,7 +365,58 @@ static int WriteWitchTraceOutput(const char *fmt, ...){ return 0; } - +#ifdef REUSE_HISTO +void ExpandReuseBinList(){ + // each time we double the size of reuse_bin_list + uint64_t *old_reuse_bin_list = reuse_bin_list; + double *old_reuse_bin_pivot_list = reuse_bin_pivot_list; + int old_reuse_bin_size = reuse_bin_size; + reuse_bin_size *= 2; + + reuse_bin_list = hpcrun_malloc(sizeof(uint64_t) * reuse_bin_size); + memset(reuse_bin_list, 0, sizeof(uint64_t) * reuse_bin_size); + memcpy(reuse_bin_list, old_reuse_bin_list, sizeof(uint64_t) * old_reuse_bin_size); + + reuse_bin_pivot_list = hpcrun_malloc(sizeof(double) * reuse_bin_size); + memset(reuse_bin_pivot_list, 0, sizeof(double) * reuse_bin_size); + memcpy(reuse_bin_pivot_list, 
old_reuse_bin_pivot_list, sizeof(double) * old_reuse_bin_size); + for(int i=old_reuse_bin_size; i < reuse_bin_size; i++){ + reuse_bin_pivot_list[i] = reuse_bin_pivot_list[i-1] * reuse_bin_ratio; + } + + //hpcrun_free(old_reuse_bin_list); + //hpcrun_free(old_reuse_bin_pivot_list); +} + +int FindReuseBinIndex(uint64_t distance){ + + if (distance < reuse_bin_pivot_list[0]){ + return 0; + } + if (distance >= reuse_bin_pivot_list[reuse_bin_size - 1]){ + ExpandReuseBinList(); + return FindReuseBinIndex(distance); + } + + int left = 0, right = reuse_bin_size - 1; + while(left + 1 < right){ + int mid = (left + right) / 2; + if ( distance < reuse_bin_pivot_list[mid]){ + right = mid; + } else { + left = mid; + } + } + assert(left + 1 == right); + return left + 1; +} + + +void ReuseAddDistance(uint64_t distance, uint64_t inc ){ + int index = FindReuseBinIndex(distance); + reuse_bin_list[index] += inc; +} +#endif /****************************************************************************** @@ -572,7 +629,7 @@ METHOD_FN(start) } td->ss_state[self->sel_idx] = START; #ifdef REUSE_HISTO - assert(OpenWitchTraceOutput()==0); + assert(OpenWitchTraceOutput()==0); #endif } @@ -605,8 +662,15 @@ static void ClientTermination(){ #ifdef REUSE_HISTO uint64_t val[3]; //fprintf(stderr, "FINAL_COUNTING:"); - WriteWitchTraceOutput("FINAL_COUNTING:"); - + if (reuse_output_trace == false){ //dump the bin info + WriteWitchTraceOutput("BIN_START: %lf\n", reuse_bin_start); + WriteWitchTraceOutput("BIN_RATIO: %lf\n", reuse_bin_ratio); + for(int i=0; i < reuse_bin_size; i++){ + WriteWitchTraceOutput("BIN: %d %lu\n", i, reuse_bin_list[i]); + } + } + + WriteWitchTraceOutput("FINAL_COUNTING:"); for (int i=0; i < MIN(2,reuse_distance_num_events); i++){ linux_perf_read_event_counter(reuse_distance_events[i], val); //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang @@ -907,7 +971,60 @@ METHOD_FN(process_event_list, int lush_metrics) case WP_REUSE: { #ifdef REUSE_HISTO -#else + { + char * 
bin_scheme_str = getenv("HPCRUN_WP_REUSE_BIN_SCHEME"); + if (bin_scheme_str){ + if ( 0 == strcasecmp(bin_scheme_str, "TRACE")){ + reuse_output_trace = true; + } + else { // it should be two numbers connected by "," + // For example, 4000.0,2.0 + char *dup_str = strdup(bin_scheme_str); + char *pos = strchr(dup_str, ','); + if ( pos == NULL){ + EEMSG("Invalid value of the environmental variable HPCRUN_WP_REUSE_BIN_SCHEME"); + free(dup_str); + monitor_real_abort(); + } + pos[0] = '\0'; + pos += 1; + + char *endptr; + reuse_bin_start = strtod(dup_str, &endptr); + if (reuse_bin_start <= 0.0 || reuse_bin_start == HUGE_VAL || endptr[0] != '\0'){ + EEMSG("Invalid value of the environmental variable HPCRUN_WP_REUSE_BIN_SCHEME"); + free(dup_str); + monitor_real_abort(); + } + reuse_bin_ratio = strtod(pos, &endptr); + if (reuse_bin_ratio <= 1.0 || reuse_bin_ratio == HUGE_VAL || endptr[0] != '\0'){ + EEMSG("Invalid value of the environmental variable HPCRUN_WP_REUSE_BIN_SCHEME"); + free(dup_str); + monitor_real_abort(); + } + free(dup_str); + printf("HPCRUN: start %lf, ratio %lf\n", reuse_bin_start, reuse_bin_ratio); + } + } else { //default + reuse_output_trace = false; + reuse_bin_start = 4000; + reuse_bin_ratio = 2; + } + + if (reuse_output_trace == false){ + reuse_bin_size = 20; + reuse_bin_list = hpcrun_malloc(sizeof(uint64_t)*reuse_bin_size); + memset(reuse_bin_list, 0, sizeof(uint64_t)*reuse_bin_size); + reuse_bin_pivot_list = hpcrun_malloc(sizeof(double)*reuse_bin_size); + reuse_bin_pivot_list[0] = reuse_bin_start; + for(int i=1; i < reuse_bin_size; i++){ + reuse_bin_pivot_list[i] = reuse_bin_pivot_list[i-1] * reuse_bin_ratio; + } + } + + } + +#else { char * monitor_type_str = getenv("HPCRUN_WP_REUSE_MONITOR_TYPE"); if(monitor_type_str){ @@ -948,7 +1065,7 @@ METHOD_FN(process_event_list, int lush_metrics) { char *concatenate_order_str = getenv("HPCRUN_WP_REUSE_CONCATENATE_ORDER"); if(concatenate_order_str && 0 == strcasecmp(concatenate_order_str, "USE_REUSE")){ - 
reuse_concatenate_use_reuse = true; + reuse_concatenate_use_reuse = true; } else{ reuse_concatenate_use_reuse = false; } @@ -969,7 +1086,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE_SUM", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_time_distance_count_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(reuse_time_distance_count_metric_id, "TIME_DISTANCE_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); - + // the next two buffers only for internal use reuse_buffer_metric_ids[0] = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[0], "REUSE_BUFFER_1", MetricFlags_ValFmt_Int, 1, metric_property_none); @@ -1496,7 +1613,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t val[2][3]; for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ linux_perf_read_event_counter( reuse_distance_events[i], val[i]); - //fprintf(stderr, "USE: %lu %lu %lu, REUSE: %lu %lu %lu\n", wpi->sample.reuseDistance[i][0], wpi->sample.reuseDistance[i][1], wpi->sample.reuseDistance[i][2], val[i][0], val[i][1], val[i][2]); + //fprintf(stderr, "USE: %lu %lu %lu, REUSE: %lu %lu %lu\n", wpi->sample.reuseDistance[i][0], wpi->sample.reuseDistance[i][1], wpi->sample.reuseDistance[i][2], val[i][0], val[i][1], val[i][2]); //fprintf(stderr, "DIFF: %lu\n", val[i][0] - wpi->sample.reuseDistance[i][0]); for(int j=0; j < 3; j++){ if (val[i][j] >= wpi->sample.reuseDistance[i][j]){ @@ -1520,20 +1637,28 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); cct_node_t *reuseNode = v.sample_node; -#ifdef REUSE_HISTO - WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); - for(int i=0; 
i < MIN(2, reuse_distance_num_events); i++){ - WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0], val[i][1], val[i][2]); +#ifdef REUSE_HISTO + if (reuse_output_trace){ + WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); + for(int i=0; i < MIN(2, reuse_distance_num_events); i++){ + WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0], val[i][1], val[i][2]); + } + WriteWitchTraceOutput("\n"); + } else{ + uint64_t rd = 0; + for(int i=0; i < MIN(2, reuse_distance_num_events); i++){ + assert(val[i][1] == 0 && val[i][2] == 0); // no counter multiplexing allowed + rd += val[i][0]; + } + ReuseAddDistance(rd, inc); } - WriteWitchTraceOutput("\n"); - #else cct_node_t *reusePairNode; if (reuse_concatenate_use_reuse){ reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); }else{ - reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); } #if 0 //jqswang: currently disable the value borrowing process @@ -1561,7 +1686,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse reuseTemporal += inc; cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); cct_metric_data_increment(reuse_time_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = time_distance}); - cct_metric_data_increment(reuse_time_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); + cct_metric_data_increment(reuse_time_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); #endif @@ -2575,7 +2700,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, 
cct_node_t *node, sd.va = aligned_pc + (r * accessLen); #endif } - else + else #endif { sd.va = data_addr; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index d3b8117d64..91538f2c8d 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -60,6 +60,7 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MAX_WP_LENGTH (8L) #define CACHE_LINE_SZ (64) #define ALIGN_TO_CACHE_LINE(addr) ((uint64_t)(addr) & (~(CACHE_LINE_SZ-1))) From d8b28921f7082c07855c9a93a208f74fd57a9dc3 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Wed, 9 May 2018 11:50:13 -0400 Subject: [PATCH 37/43] cleaned some code --- src/tool/hpcrun/hpcrun_stats.c | 50 ++--- src/tool/hpcrun/hpcrun_stats.h | 10 +- src/tool/hpcrun/metrics.c | 25 ++- .../hpcrun/sample-sources/perf/linux_perf.c | 142 +++++--------- .../sample-sources/watchpoint_clients.c | 11 +- .../sample-sources/watchpoint_support.c | 184 +++++++++--------- 6 files changed, 187 insertions(+), 235 deletions(-) diff --git a/src/tool/hpcrun/hpcrun_stats.c b/src/tool/hpcrun/hpcrun_stats.c index f0c9391163..5d4867eaa8 100644 --- a/src/tool/hpcrun/hpcrun_stats.c +++ b/src/tool/hpcrun/hpcrun_stats.c @@ -275,70 +275,70 @@ void hpcrun_stats_num_insane_ip_inc(long val) { atomic_fetch_add_explicit(&num_insane_ip, val, memory_order_relaxed); -} - - -long +} + + +long hpcrun_stats_num_insane_ip(void) -{ +{ return atomic_load_explicit(&num_insane_ip, memory_order_relaxed); -} +} void hpcrun_stats_num_writtenBytes_inc(long val) { atomic_fetch_add_explicit(&num_writtenBytes, val, memory_order_relaxed); -} - +} + void hpcrun_stats_num_usedBytes_inc(long val) { atomic_fetch_add_explicit(&num_usedBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_deadBytes_inc(long val) { atomic_fetch_add_explicit(&num_deadBytes, val, memory_order_relaxed); -} +} void 
hpcrun_stats_num_newBytes_inc(long val) { atomic_fetch_add_explicit(&num_newBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_oldAppxBytes_inc(long val) { atomic_fetch_add_explicit(&num_oldAppxBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_oldBytes_inc(long val) { atomic_fetch_add_explicit(&num_oldBytes, val, memory_order_relaxed); -} - +} + void hpcrun_stats_num_loadedBytes_inc(long val) { atomic_fetch_add_explicit(&num_loadedBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_accessedIns_inc(long val) { atomic_fetch_add_explicit(&num_accessedIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_reuseTemporal_inc(long val) { atomic_fetch_add_explicit(&num_reuseTemporal, val, memory_order_relaxed); -} +} void hpcrun_stats_num_reuseSpatial_inc(long val) @@ -350,26 +350,26 @@ void hpcrun_stats_num_latency_inc(long val) { atomic_fetch_add_explicit(&num_latency, val, memory_order_relaxed); -} +} void hpcrun_stats_num_falseWWIns_inc(long val) { atomic_fetch_add_explicit(&num_falseWWIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_falseRWIns_inc(long val) { atomic_fetch_add_explicit(&num_falseRWIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_falseWRIns_inc(long val) { atomic_fetch_add_explicit(&num_falseWRIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_trueWWIns_inc(long val) @@ -391,7 +391,7 @@ hpcrun_stats_num_trueWRIns_inc(long val) //----------------------------- -// samples total +// samples total //----------------------------- void @@ -410,7 +410,7 @@ hpcrun_stats_num_samples_total(void) //----------------------------- -// samples attempted +// samples attempted //----------------------------- void @@ -429,7 +429,7 @@ hpcrun_stats_num_samples_attempted(void) //----------------------------- -// samples blocked async +// samples blocked async //----------------------------- // The async blocks happen in the signal handlers, without getting to @@ -451,7 +451,7 @@ 
hpcrun_stats_num_samples_blocked_async(void) //----------------------------- -// samples blocked dlopen +// samples blocked dlopen //----------------------------- void diff --git a/src/tool/hpcrun/hpcrun_stats.h b/src/tool/hpcrun/hpcrun_stats.h index 7e2c70c724..045424cf31 100644 --- a/src/tool/hpcrun/hpcrun_stats.h +++ b/src/tool/hpcrun/hpcrun_stats.h @@ -52,7 +52,7 @@ void hpcrun_stats_reinit(void); //----------------------------- -// watchpoint +// watchpoint //----------------------------- void hpcrun_stats_num_samples_imprecise_inc(long val); long hpcrun_stats_num_samples_imprecise(void); @@ -91,7 +91,7 @@ void hpcrun_stats_num_loadedBytes_inc(long val); //----------------------------- -// samples total +// samples total //----------------------------- void hpcrun_stats_num_samples_total_inc(void); @@ -99,7 +99,7 @@ long hpcrun_stats_num_samples_total(void); //----------------------------- -// samples attempted +// samples attempted //----------------------------- void hpcrun_stats_num_samples_attempted_inc(void); @@ -107,7 +107,7 @@ long hpcrun_stats_num_samples_attempted(void); //----------------------------- -// samples blocked async +// samples blocked async //----------------------------- void hpcrun_stats_num_samples_blocked_async_inc(void); @@ -115,7 +115,7 @@ long hpcrun_stats_num_samples_blocked_async(void); //----------------------------- -// samples blocked dlopen +// samples blocked dlopen //----------------------------- void hpcrun_stats_num_samples_blocked_dlopen_inc(void); diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c index a4627f24d8..d269c79493 100644 --- a/src/tool/hpcrun/metrics.c +++ b/src/tool/hpcrun/metrics.c @@ -211,7 +211,7 @@ hpcrun_get_num_metrics() id2metric[l->id] = &(l->val); } metric_proc_tbl = (metric_upd_proc_t**) hpcrun_malloc(n_metrics * sizeof(metric_upd_proc_t*)); - + for(metric_proc_map_t* l = proc_map; l; l = l->next) { // for(metric_proc_map_t* l = proc_map; l; l = l->next) { TMSG(METRICS_FINALIZE, 
"metric_proc[%d] = %p", l->id, l->proc); @@ -233,7 +233,7 @@ hpcrun_get_num_metrics() // Finalize metrics -void hpcrun_finalize_metrics() +void hpcrun_finalize_metrics() { hpcrun_get_num_metrics(); } @@ -241,7 +241,7 @@ void hpcrun_finalize_metrics() metric_desc_t* hpcrun_id2metric(int metric_id) { - hpcrun_get_num_metrics(); + hpcrun_get_num_metrics(); if ((0 <= metric_id) && (metric_id < n_metrics)) { return id2metric[metric_id]; } @@ -307,7 +307,7 @@ hpcrun_new_metric_of_kind(kind_info_t* kind) kind->idx++; n_metrics++; - + // // No preallocation for metric_proc tbl // @@ -316,7 +316,7 @@ hpcrun_new_metric_of_kind(kind_info_t* kind) m->id = metric_data->id; m->proc = (metric_upd_proc_t*) NULL; proc_map = m; - + return metric_data->id; } @@ -326,7 +326,7 @@ hpcrun_new_metric(void) return hpcrun_new_metric_of_kind(current_kind); } -metric_desc_t* +metric_desc_t* hpcrun_set_metric_info_w_fn(int metric_id, const char* name, MetricFlags_ValFmt_t valFmt, size_t period, metric_upd_proc_t upd_fn, metric_desc_properties_t prop) @@ -377,7 +377,7 @@ hpcrun_set_metric_info_w_fn(int metric_id, const char* name, } -metric_desc_t* +metric_desc_t* hpcrun_set_metric_info_and_period(int metric_id, const char* name, MetricFlags_ValFmt_t valFmt, size_t period, metric_desc_properties_t prop) { @@ -454,7 +454,7 @@ hpcrun_metric_std(int metric_id, metric_set_t* set, switch (minfo->flags.fields.valFmt) { case MetricFlags_ValFmt_Int: if (operation == '+') - loc->i += val.i; + loc->i += val.i; else if (operation == '=') loc->i = val.i; break; @@ -479,13 +479,13 @@ hpcrun_metric_std_set(int metric_id, metric_set_t* set, } // -// Given two metrics, metric_id1 and metric_id2, -// bump up metric_id2 to reach metric_id1 and return +// Given two metrics, metric_id1 and metric_id2, +// bump up metric_id2 to reach metric_id1 and return // the difference between them multiplied by the period. 
// int -hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, - metric_set_t* set, cct_metric_data_t * diff, +hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, + metric_set_t* set, cct_metric_data_t * diff, cct_metric_data_t * diffWithPeriod) { metric_desc_t* minfo1 = hpcrun_id2metric(metric_id1); @@ -509,7 +509,6 @@ hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, diff->i = (loc1->i - loc2->i); break; case MetricFlags_ValFmt_Real: - //assert(loc1->r >= loc2->r); //jqswang if (loc1->r < loc2->r){ diff->r = 0; } diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 1ab9af35c0..f9bec9f2c0 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -62,7 +62,7 @@ #include #include -#include +#include #include #include #include @@ -106,7 +106,7 @@ #include // prefix for metric helper -#include +#include #ifdef ENABLE_PERFMON #include "perfmon-util.h" @@ -173,36 +173,22 @@ struct event_threshold_s { }; //****************************************************************************** -// forward declarations +// forward declarations //****************************************************************************** static int restart_perf_event(int fd); -static bool +static bool perf_thread_init(event_info_t *event, event_thread_t *et); -static void +static void perf_thread_fini(int nevents, event_thread_t *event_thread); -static int +static int perf_event_handler( int sig, siginfo_t* siginfo, void* context); -static inline uint64_t perf_scale(uint64_t *values) { //jqswang - uint64_t res = 0; - - if (!values[2] && !values[1] && values[0]) { - fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); - } - if (values[2] > values[1]) { - fprintf(stderr, "WARNING: time_running > time_enabled\n"); - } - if (values[2]) { - res = (uint64_t)((double)values[0] * values[1]/values[2]); - } - return 
res; -} //****************************************************************************** // constants @@ -240,13 +226,30 @@ extern __thread bool hpcrun_thread_suppress_sample; //****************************************************************************** -// private operations +// private operations //****************************************************************************** +// The values array consists of three elements. +// values[0]: raw counter value; values[1]: the time enabled; values[2]: the time running +static inline uint64_t perf_scale(uint64_t *values) { + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + EMSG("WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + EMSG("WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + /* * Enable all the counters - */ + */ static void perf_start_all(int nevents, event_thread_t *event_thread) { @@ -258,7 +261,7 @@ perf_start_all(int nevents, event_thread_t *event_thread) /* * Disable all the counters - */ + */ static void perf_stop_all(int nevents, event_thread_t *event_thread) { @@ -286,12 +289,12 @@ perf_get_pmu_support(const char *name, struct perf_event_attr *event_attr) // initialization //---------------------------------------------------------- -static void +static void perf_init() { perf_mmap_init(); - // initialize mask to block PERF_SIGNAL + // initialize mask to block PERF_SIGNAL sigemptyset(&sig_mask); sigaddset(&sig_mask, PERF_SIGNAL); @@ -321,7 +324,7 @@ perf_init() // initialize an event // event_num: event number // name: name of event (has to be recognized by perf event) -// threshold: sampling threshold +// threshold: sampling threshold //---------------------------------------------------------- static bool perf_thread_init(event_info_t *event, event_thread_t *et) @@ -343,14 +346,14 @@ perf_thread_init(event_info_t *event, 
event_thread_t *et) return false; } - // create mmap buffer for this file + // create mmap buffer for this file et->mmap = set_mmap(et->fd); // make sure the file I/O is asynchronous int flag = fcntl(et->fd, F_GETFL, 0); int ret = fcntl(et->fd, F_SETFL, flag | O_ASYNC ); if (ret == -1) { - EMSG("Can't set notification for event %d, fd: %d: %s", + EMSG("Can't set notification for event %d, fd: %d: %s", event->id, et->fd, strerror(errno)); } @@ -368,7 +371,7 @@ perf_thread_init(event_info_t *event, event_thread_t *et) owner.pid = syscall(SYS_gettid); ret = fcntl(et->fd, F_SETOWN_EX, &owner); if (ret == -1) { - EMSG("Can't set thread owner for event %d, fd: %d: %s", + EMSG("Can't set thread owner for event %d, fd: %d: %s", event->id, et->fd, strerror(errno)); } @@ -378,7 +381,7 @@ perf_thread_init(event_info_t *event, event_thread_t *et) //---------------------------------------------------------- -// actions when the program terminates: +// actions when the program terminates: // - unmap the memory // - close file descriptors used by each event //---------------------------------------------------------- @@ -386,10 +389,10 @@ static void perf_thread_fini(int nevents, event_thread_t *event_thread) { for(int i=0; iprecise_pc = 0; } - - if ( strstr(current->event->metric_desc->name, "LATENCY_ABOVE_THRESHOLD") || strstr(current->event->metric_desc->name, "LOAD_LATENCY") ) { + + if ( strstr(current->event->metric_desc->name, "LATENCY_ABOVE_THRESHOLD") || strstr(current->event->metric_desc->name, "LOAD_LATENCY") ) { perf_mmap_data_src_t data_src; data_src.val = mmap_data->data_src; if ( (data_src.mem_lvl & PERF_MEM_LVL_HIT) && (data_src.mem_lvl & PERF_MEM_LVL_L1)){ // L1 HIT, ignore @@ -530,7 +533,7 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, TMSG(RALOC, "--------------------------"); } - + if(WatchpointClientActive()){ OnSample(mmap_data, hpcrun_context_pc(context), @@ -710,7 +713,7 @@ METHOD_FN(shutdown) // FIXME: add component shutdown code 
here event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; - int nevents = (self->evl).nevents; + int nevents = (self->evl).nevents; perf_thread_fini(nevents, event_thread); @@ -750,7 +753,7 @@ METHOD_FN(supports_event, const char *ev_str) } - + // -------------------------------------------------------------------------- // handle a list of events // -------------------------------------------------------------------------- @@ -770,9 +773,9 @@ METHOD_FN(process_event_list, int lush_metrics) // automatically. But in practice, it didn't. Not sure why. for (event = start_tok(evlist); more_tok(); event = next_tok(), num_events++); - + self->evl.nevents = num_events; - + // setup all requested events // if an event cannot be initialized, we still keep it in our list // but there will be no samples @@ -845,7 +848,7 @@ METHOD_FN(process_event_list, int lush_metrics) // ------------------------------------------------------------ // initialize the property of the metric - // if the metric's name has "CYCLES" it mostly a cycle metric + // if the metric's name has "CYCLES" it mostly a cycle metric // this assumption is not true, but it's quite closed // ------------------------------------------------------------ @@ -867,14 +870,14 @@ METHOD_FN(process_event_list, int lush_metrics) reuse_distance_events[reuse_distance_num_events++] = i; } /**************************************************/ - + // ------------------------------------------------------------ // if we use frequency (event_type=1) then the period is not deterministic, // it can change dynamically. 
In this case, the period is 1 // ------------------------------------------------------------ if (!is_period) { - // using frequency : the threshold is always 1, + // using frequency : the threshold is always 1, // since the period is determine dynamically threshold = 1; } @@ -1004,40 +1007,15 @@ restart_perf_event(int fd) TMSG(LINUX_PERF, "Unable to start event: fd is not valid"); return -1; } -#if 0 //jqswang - uint64_t val[3]; - read(fd, val, sizeof(uint64_t)*3); - //fprintf(stderr, "Before RESET %lx %lx %lx\n", val[0], val[1], val[2]); - fprintf(stderr, "Before RESET1 %lx\n", val[0]); - - for(volatile int i=0; i< 1000; i++); - read(fd, val, sizeof(uint64_t)*3); - fprintf(stderr, "Before RESET2 %lx\n", val[0]); -#endif int ret = ioctl(fd, PERF_EVENT_IOC_RESET, 0); if (ret == -1) { TMSG(LINUX_PERF, "error fd %d in PERF_EVENT_IOC_RESET: %s", fd, strerror(errno)); } -#if 0 //jqsang - read(fd, val, sizeof(uint64_t)*3); - //fprintf(stderr, "AFTER RESET %lx %lx %lx\n", val[0], val[1], val[2]); - fprintf(stderr, "AFTER RESET %lx\n", val[0]); -#endif - ret = ioctl(fd, PERF_EVENT_IOC_REFRESH, 1); if (ret == -1) { TMSG(LINUX_PERF, "error fd %d in IOC_REFRESH: %s", fd, strerror(errno)); } - //jqswang -#if 0 - for(volatile int i=0; i< 1000; i++); - ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); - uint64_t val[3]; - ret = read(fd, val, sizeof(uint64_t) * 3 ); - fprintf(stderr, "After DISABLE %lx %lx %lx\n", val[0], val[1], val[2]); - ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); -#endif return ret; } /*************************************************************************** @@ -1073,7 +1051,7 @@ int linux_perf_read_event_counter(int event_index, uint64_t *val){ event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; event_thread_t *current = &(event_thread[event_index]); - + int ret = perf_read_event_counter(current, val); if (ret < 0) return -1; // something wrong here @@ -1082,8 +1060,6 @@ int linux_perf_read_event_counter(int event_index, uint64_t *val){ return 0; } else { // 
overflow event - //fprintf(stderr, "DEBUG: %lu, %lu %lu %lu\n", sample_period, val[0],val[1],val[2]); - //uint64_t scaled_val = perf_scale(val); assert(val[1] == val[2]); //jqswang: TODO: I have no idea how to calculate the value under multiplexing for overflow event. int64_t scaled_val = (int64_t) val[0] ;//% sample_period; if (scaled_val >= sample_period || scaled_val < 0){ //jqswang: TODO: it does not filter out all the invalid values @@ -1105,8 +1081,8 @@ int linux_perf_read_event_counter(int event_index, uint64_t *val){ static int perf_event_handler( - int sig, - siginfo_t* siginfo, + int sig, + siginfo_t* siginfo, void* context ) { @@ -1141,7 +1117,7 @@ perf_event_handler( // ---------------------------------------------------------------------------- if (siginfo->si_code < 0) { - TMSG(LINUX_PERF, "signal si_code %d < 0 indicates not from kernel", + TMSG(LINUX_PERF, "signal si_code %d < 0 indicates not from kernel", siginfo->si_code); perf_start_all(nevents, event_thread); @@ -1177,7 +1153,7 @@ perf_event_handler( // ---------------------------------------------------------------------------- // check #4: // check the index of the file descriptor (if we have multiple events) - // if the file descriptor is not on the list, we shouldn't store the + // if the file descriptor is not on the list, we shouldn't store the // metrics. Perhaps we should throw away? // ---------------------------------------------------------------------------- @@ -1195,7 +1171,7 @@ perf_event_handler( return 1; // tell monitor the signal has not been handled. 
} - // Increment the number of overflows for the current event + // Increment the number of overflows for the current event current->num_overflows++; @@ -1221,20 +1197,6 @@ perf_event_handler( kernel_block_handler(current, sv, &mmap_data); } while (more_data); -#if 0//jqswang - uint64_t val[3]; - read(fd, val, sizeof(uint64_t)*3); - //fprintf(stderr, "Before RESET %lx %lx %lx\n", val[0], val[1], val[2]); - //fprintf(stderr, "Before RESTART %s %lx\n", current->event->metric_desc->name,val[0]); - extern int reuse_distance_num_events; - extern int *reuse_distance_events; - for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ - //linux_perf_read_event_counter( reuse_distance_events[i], val); - //fprintf(stderr, "READING %lx [%lu] ---", val[0], ); - } - //fprintf(stderr,"\n"); -#endif - restart_perf_event(fd); perf_start_all(nevents, event_thread); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index e737755757..8edcbe0a5a 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -181,7 +181,7 @@ int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; int pebs_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; -static inline uint64_t perf_scale(uint64_t *values) { //jqswang +static inline uint64_t perf_scale(uint64_t *values) { uint64_t res = 0; if (!values[2] && !values[1] && values[0]) { @@ -2683,15 +2683,6 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, int idx = rdtsc() % numFSLocs; //randomly choose one location to monitor sd.va = (void *)falseSharingLocs[idx].va; sd.reuseType = REUSE_SPATIAL; -#if 0 - // jqswang: I am not sure what the following code does - // randomly protect another word in the same cache line - uint64_t aligned_pc = ALIGN_TO_CACHE_LINE((uint64_t)data_addr); - if ((rdtsc() & 1) == 0) - sd.va = (void*) (aligned_pc - CACHE_LINE_SZ); 
- else - sd.va = (void *) (aligned_pc + CACHE_LINE_SZ); -#endif #if 0 int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; int bound = CACHE_LINE_SZ / accessLen; diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index 0017a47799..1546517cfe 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -190,10 +190,10 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context); __attribute__((constructor)) static void InitConfig(){ tData.fptr = NULL; - + volatile int dummyWP[MAX_WP_SLOTS]; wpConfig.isLBREnabled = true; - + struct perf_event_attr peLBR = { .type = PERF_TYPE_BREAKPOINT, .size = sizeof(struct perf_event_attr), @@ -225,8 +225,8 @@ static void InitConfig(){ } } CHECK(close(fd)); - - + + #if defined(FAST_BP_IOC_FLAG) wpConfig.isWPModifyEnabled = true; #else @@ -236,7 +236,7 @@ static void InitConfig(){ //wpConfig.signalDelivered = SIGIO; //wpConfig.signalDelivered = SIGUSR1; wpConfig.signalDelivered = SIGRTMIN + 3; - + // Setup the signal handler sigset_t block_mask; sigfillset(&block_mask); @@ -246,18 +246,18 @@ static void InitConfig(){ .sa_mask = block_mask, .sa_flags = SA_SIGINFO | SA_RESTART | SA_NODEFER | SA_ONSTACK }; - + if(monitor_sigaction(wpConfig.signalDelivered, OnWatchPoint, 0 /*flags*/, &sa1) == -1) { fprintf(stderr, "Failed to set WHICH_SIG handler: %s\n", strerror(errno)); monitor_real_abort(); } - - - - - + + + + + wpConfig.pgsz = sysconf(_SC_PAGESIZE); - + // identify max WP supported by the architecture volatile int wpHandles[MAX_WP_SLOTS]; int i = 0; @@ -281,7 +281,7 @@ static void InitConfig(){ break; } } - + if(i == 0) { fprintf(stderr, "Cannot create a single watch point\n"); monitor_real_abort(); @@ -290,10 +290,10 @@ static void InitConfig(){ CHECK(close(wpHandles[j])); } wpConfig.maxWP = i; - + // Should we get the floating point type in an access? 
wpConfig.getFloatType = false; - + // Get the replacement scheme char * replacementScheme = getenv("HPCRUN_WP_REPLACEMENT_SCHEME"); if(replacementScheme){ @@ -313,7 +313,7 @@ static void InitConfig(){ // default; wpConfig.replacementPolicy = AUTO; } - + // Should we fix IP off by one? char * fixIP = getenv("HPCRUN_WP_DONT_FIX_IP"); if(fixIP){ @@ -329,7 +329,7 @@ static void InitConfig(){ // default; wpConfig.dontFixIP = false; } - + // Should we get the address in a WP trigger? char * disassembleWPAddress = getenv("HPCRUN_WP_DONT_DISASSEMBLE_TRIGGER_ADDRESS"); if(disassembleWPAddress){ @@ -346,8 +346,8 @@ static void InitConfig(){ wpConfig.dontDisassembleWPAddress = false; } - - + + } void RedSpyWPConfigOverride(void *v){ @@ -412,7 +412,7 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, .exclude_hv = 1, .disabled = 0, /* enabled */ }; - + switch (sampleData->wpLength) { case 1: pe.bp_len = HW_BREAKPOINT_LEN_1; break; case 2: pe.bp_len = HW_BREAKPOINT_LEN_2; break; @@ -423,13 +423,13 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, monitor_real_abort(); } pe.bp_addr = (uintptr_t)sampleData->va; - + switch (sampleData->type) { case WP_READ: pe.bp_type = HW_BREAKPOINT_R; break; case WP_WRITE: pe.bp_type = HW_BREAKPOINT_W; break; default: pe.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; } - + #if defined(FAST_BP_IOC_FLAG) if(modify) { // modification @@ -452,10 +452,10 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, } // Set the perf_event file to async mode CHECK(fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL, 0) | O_ASYNC)); - + // Tell the file to send a signal when an event occurs CHECK(fcntl(perf_fd, F_SETSIG, wpConfig.signalDelivered)); - + // Deliver the signal to this thread struct f_owner_ex fown_ex; fown_ex.type = F_OWNER_TID; @@ -465,17 +465,17 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, EMSG("Failed to set the owner of 
the perf event file: %s\n", strerror(errno)); return; } - - + + // CHECK(fcntl(perf_fd, F_SETOWN, gettid())); - + wpi->fileHandle = perf_fd; // mmap the file if lbr is enabled //if(wpConfig.isLBREnabled) { wpi->mmapBuffer = MAPWPMBuffer(perf_fd); //} } - + wpi->isActive = true; wpi->va = (void *) pe.bp_addr; wpi->sample = *sampleData; @@ -498,7 +498,7 @@ static void CreateDummyHardwareEvent(void) { .exclude_hv = 1, .branch_sample_type = PERF_SAMPLE_BRANCH_ANY, }; - + // Create the perf_event for this thread on all CPUs with no event group int perf_fd = perf_event_open(&pe, 0, -1, -1, 0); if (perf_fd == -1) { @@ -516,14 +516,14 @@ static void CloseDummyHardwareEvent(int perf_fd){ /*********** Client interfaces *******/ static void DisArm(WatchPointInfo_t * wpi){ - + // assert(wpi->isActive); assert(wpi->fileHandle != -1); - + if(wpi->mmapBuffer) UNMAPWPMBuffer(wpi->mmapBuffer); wpi->mmapBuffer = 0; - + CHECK(close(wpi->fileHandle)); wpi->fileHandle = -1; wpi->isActive = false; @@ -539,7 +539,7 @@ static bool ArmWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData) { return true; } } - + // disable the old WP if active if(wpi->isActive) { DisArm(wpi); @@ -562,7 +562,7 @@ void WatchpointThreadInit(WatchPointUpCall_t func){ EMSG("Failed sigaltstack"); monitor_real_abort(); } - + tData.lbrDummyFD = -1; tData.fptr = func; tData.fs_reg_val = (void*)-1; @@ -576,15 +576,15 @@ void WatchpointThreadInit(WatchPointUpCall_t func){ tData.numWatchpointDropped = 0; tData.numSampleTriggeringWatchpoints = 0; tData.numInsaneIP = 0; - - + + for (int i=0; i tData.watchPointArray[i].startTime) { @@ -738,7 +738,7 @@ static VictimType GetVictim(int * location, ReplacementPolicy policy){ return NON_EMPTY_SLOT; } break; - + case EMPTY_SLOT_ONLY:{ return NONE_AVAILABLE; } @@ -763,12 +763,12 @@ static void ConsumeAllRingBufferData(void *mbuf) { * data points to beginning of buffer payload */ void * data = ((void *)hdr) + wpConfig.pgsz; - + /* * position of tail within the buffer 
payload */ tail = hdr->data_tail & pgmsk; - + /* * size of what is available * @@ -798,12 +798,12 @@ static int ReadMampBuffer(void *mbuf, void *buf, size_t sz) { * data points to beginning of buffer payload */ data = ((void *)hdr) + wpConfig.pgsz; - + /* * position of tail within the buffer payload */ tail = hdr->data_tail & pgmsk; - + /* * size of what is available * @@ -815,15 +815,15 @@ static int ReadMampBuffer(void *mbuf, void *buf, size_t sz) { rmb(); return -1; } - + /* From perf_event_open() manpage */ rmb(); - - + + /* * sz <= avail_sz, we can satisfy the request */ - + /* * c = size till end of buffer * @@ -831,23 +831,23 @@ static int ReadMampBuffer(void *mbuf, void *buf, size_t sz) { * a power of two, so we can do: */ c = pgmsk + 1 - tail; - + /* * min with requested size */ m = c < sz ? c : sz; - + /* copy beginning */ memcpy(buf, data + tail, m); - + /* * copy wrapped around leftover */ if (sz > m) memcpy(buf + m, data, sz - m); - + hdr->data_tail += sz; - + return 0; } @@ -901,7 +901,7 @@ static inline void * GetPatchedIP(void * contextIP) { static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrigger_t *wpt, void * context){ //struct perf_event_mmap_page * b = wpi->mmapBuffer; struct perf_event_header hdr; - + if (ReadMampBuffer(wpi->mmapBuffer, &hdr, sizeof(struct perf_event_header)) < 0) { EMSG("Failed to ReadMampBuffer: %s\n", strerror(errno)); monitor_real_abort(); @@ -966,9 +966,9 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig reliableIP = contextIP-1; } } - + wpt->pc = reliableIP; - + if(wpConfig.dontDisassembleWPAddress == false){ FloatType * floatType = wpConfig.getFloatType? &wpt->floatType : 0; if(false == get_mem_access_length_and_type_address(wpt->pc, (uint32_t*) &(wpt->accessLength), &(wpt->accessType), floatType, context, &addr)){ @@ -979,8 +979,8 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig //EMSG("WP triggered 0 access length! 
at pc=%p\n", wpt->pc); goto ErrExit; } - - + + void * patchedAddr = (void *)-1; // Stack affecting addresses will be off by 8 // Some instructions affect the address computing register: mov (%rax),%eax @@ -991,7 +991,7 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig else tData.numWatchpointImpreciseAddressArbitraryLength ++; - + tData.numWatchpointImpreciseAddressArbitraryLength ++; patchedAddr = wpi->va; } else { @@ -1026,7 +1026,7 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig //SkipBuffer(wpi->mmapBuffer , hdr.size - sizeof(hdr)); goto ErrExit; } - + ErrExit: // We must cleanup the mmap buffer if there is any data left ConsumeAllRingBufferData(wpi->mmapBuffer); @@ -1051,12 +1051,12 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ // and return and avoid any MSG. void* pc = hpcrun_context_pc(context); if (!hpcrun_safe_enter_async(pc)) return 0; - + linux_perf_events_pause(); - + tData.numWatchpointTriggers++; //fprintf(stderr, " numWatchpointTriggers = %lu, \n", tData.numWatchpointTriggers); - + //find which watchpoint fired int location = -1; for(int i = 0 ; i < wpConfig.maxWP; i++) { @@ -1065,7 +1065,7 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ break; } } - + // Ensure it is an active WP if(location == -1) { EMSG("\n WP trigger did not match any known active WP\n"); @@ -1075,7 +1075,7 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ //fprintf("\n WP trigger did not match any known active WP\n"); return 0; } - + WatchPointTrigger_t wpt; WPTriggerActionType retVal; WatchPointInfo_t *wpi = &tData.watchPointArray[location]; @@ -1096,15 +1096,15 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ monitor_real_abort(); break; } - - + + if( false == CollectWatchPointTriggerInfo(wpi, &wpt, context)) { tData.numWatchpointDropped++; retVal = DISABLE_WP; // disable if unable to collect any info. 
} else { retVal = tData.fptr(wpi, 0, wpt.accessLength/* invalid*/, &wpt); } - + // Let the client take action. switch (retVal) { case DISABLE_WP: { @@ -1166,7 +1166,7 @@ static bool ValidateWPData(SampleData_t * sampleData){ else return false; break; - + default: EMSG("Unsuppported WP length %d", sampleData->wpLength); monitor_real_abort(); @@ -1209,19 +1209,19 @@ bool SubscribeWatchpoint(SampleData_t * sampleData, OverwritePolicy overwritePol if(IsOveralpped(sampleData)){ return false; // drop the sample if it overlaps an existing address } - + // No overlap, look for a victim slot int victimLocation = -1; // Find a slot to install WP VictimType r = GetVictim(&victimLocation, wpConfig.replacementPolicy); - + if(r != NONE_AVAILABLE) { // VV IMP: Capture value before arming the WP. if(captureValue) CaptureValue(sampleData, &tData.watchPointArray[victimLocation]); // I know the error case that we have captured the value but ArmWatchPoint fails. // I am not handling that corner case because ArmWatchPoint() will fail with a monitor_real_abort(). 
- + if(ArmWatchPoint(&tData.watchPointArray[victimLocation], sampleData) == false){ //LOG to hpcrun log EMSG("ArmWatchPoint failed for address %p", sampleData->va); @@ -1241,14 +1241,14 @@ WPUpCallTRetType Test1UpCall(WatchPointInfo_t * wp, WatchPointTrigger_t * wt) { printf("\n Test1UpCall %p\n", wt->va); if(wpConfig.isLBREnabled) assert(wp->sample.va == wt->va); - + cnt ++; return DISABLE; } void TestBasic(){ tData.fptr = Test1UpCall; - + sigset_t block_mask; sigemptyset (&block_mask); // Set a signal handler for SIGUSR1 @@ -1257,18 +1257,18 @@ void TestBasic(){ // .sa_mask = block_mask, .sa_flags = SA_SIGINFO | SA_RESTART | SA_NODEFER }; - + if(sigaction(wpConfig.signalDelivered, &sa1, NULL) == -1) { fprintf(stderr, "Failed to set WHICH_SIG handler: %s\n", strerror(errno)); monitor_real_abort(); } - - + + WatchpointThreadInit(); int N = 10000; volatile int dummyWPLocation[10000]; cnt = 0; - + for(int i = 0 ; i < N; i++) { SampleData_t s = {.va = &dummyWPLocation[i], .wpLength = sizeof(int), .type = WP_WRITE}; SubscribeWatchpoint(&s, AUTO); From 0360f542324e8dd92f79389261f7363030e1343f Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Tue, 22 May 2018 00:10:53 -0400 Subject: [PATCH 38/43] Fixed a bug of opening perf events more than needed. --- src/tool/hpcrun/sample-sources/perf/perf-util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.c b/src/tool/hpcrun/sample-sources/perf/perf-util.c index 490f6b6996..8a4597dabd 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.c +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.c @@ -342,6 +342,7 @@ get_precise_ip(struct perf_event_attr *attr) THREAD_SELF, CPU_ANY, GROUP_FD, PERF_FLAGS); if (ret >= 0) { + close(ret); return val; } EMSG("The kernel does not support the requested ip-precision: %d." 
From 9fcc35cd06383a87ec7b4487e791e32e4b2131bc Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Mon, 16 Jul 2018 23:27:26 -0400 Subject: [PATCH 39/43] Added the missing "}" due to resolving the conflicts of previous merge --- src/tool/hpcrun/sample-sources/perf/linux_perf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index f40f4a974d..c2a1f5adc9 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -243,6 +243,7 @@ static inline uint64_t perf_scale(uint64_t *values) { res = (uint64_t)((double)values[0] * values[1]/values[2]); } return res; +} /* * determine whether the perf sample source has been finalized for this thread From 2500cf1e782b87ecd40f04a46339f0a3deed44b0 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 11 Aug 2018 00:21:03 -0400 Subject: [PATCH 40/43] Fixed some counter issues. HPCA submission version --- src/tool/hpcrun/hpcrun_stats.c | 15 +++++++++-- .../hpcrun/sample-sources/perf/linux_perf.c | 25 ++++++++++++++++--- .../sample-sources/watchpoint_clients.c | 6 ++--- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/tool/hpcrun/hpcrun_stats.c b/src/tool/hpcrun/hpcrun_stats.c index 5d4867eaa8..f53f5e7421 100644 --- a/src/tool/hpcrun/hpcrun_stats.c +++ b/src/tool/hpcrun/hpcrun_stats.c @@ -105,6 +105,7 @@ static atomic_long num_falseWRIns = ATOMIC_VAR_INIT(0); static atomic_long num_reuseSpatial = ATOMIC_VAR_INIT(0); static atomic_long num_reuseTemporal = ATOMIC_VAR_INIT(0); static atomic_long num_latency = ATOMIC_VAR_INIT(0); +static atomic_long num_corrected_reuse_distance = ATOMIC_VAR_INIT(0); static atomic_long num_unwind_intervals_total = ATOMIC_VAR_INIT(0); static atomic_long num_unwind_intervals_suspicious = ATOMIC_VAR_INIT(0); @@ -156,6 +157,11 @@ hpcrun_stats_reinit(void) atomic_store_explicit(&num_trueWWIns, 0, memory_order_relaxed); 
atomic_store_explicit(&num_trueRWIns, 0, memory_order_relaxed); atomic_store_explicit(&num_trueWRIns, 0, memory_order_relaxed); + + atomic_store_explicit(&num_reuseSpatial, 0, memory_order_relaxed); + atomic_store_explicit(&num_reuseTemporal, 0, memory_order_relaxed); + atomic_store_explicit(&num_latency, 0, memory_order_relaxed); + atomic_store_explicit(&num_corrected_reuse_distance, 0, memory_order_relaxed); } @@ -352,6 +358,12 @@ hpcrun_stats_num_latency_inc(long val) atomic_fetch_add_explicit(&num_latency, val, memory_order_relaxed); } +void +hpcrun_stats_num_corrected_reuse_distance_inc(long val) +{ + atomic_fetch_add_explicit(&num_corrected_reuse_distance, val, memory_order_relaxed); +} + void hpcrun_stats_num_falseWWIns_inc(long val) { @@ -624,7 +636,6 @@ hpcrun_stats_num_samples_yielded(void) //----------------------------- // print summary //----------------------------- - void hpcrun_stats_print_summary(void) { @@ -644,7 +655,7 @@ hpcrun_stats_print_summary(void) getrusage(RUSAGE_SELF, &rusage); //AMSG("WATCHPOINT ANOMALIES: samples:%ld, SM_imprecise:%ld, WP_Set:%ld, WP_triggered:%ld, WP_SampleTriggering:%ld, WP_ImpreciseIP:%ld, WP_InsaneIP:%ld, WP_Off8Addr:%ld, WP_ImpreciseAddr:%ld, WP_Dropped:%ld", num_samples_total, num_samples_imprecise, num_watchpoints_set, num_watchpoints_triggered, num_sample_triggering_watchpoints, num_watchpoints_imprecise, num_insane_ip, num_watchpoints_imprecise_address_8_byte, num_watchpoints_imprecise_address, num_watchpoints_dropped); - AMSG("WATCHPOINT ANOMALIES: samples:%.2e, SM_imprecise:%.2e, WP_Set:%.2e, WP_triggered:%.2e, WP_SampleTriggering:%.2e, WP_ImpreciseIP:%.2e, WP_InsaneIP:%.2e, WP_Off8Addr:%.2e, WP_ImpreciseAddr:%.2e, WP_Dropped:%.2e", (double)atomic_load(&num_samples_total), (double)atomic_load(&num_samples_imprecise), (double)atomic_load(&num_watchpoints_set), (double)atomic_load(&num_watchpoints_triggered), (double)atomic_load(&num_sample_triggering_watchpoints), 
(double)atomic_load(&num_watchpoints_imprecise), (double)atomic_load(&num_insane_ip), (double)atomic_load(&num_watchpoints_imprecise_address_8_byte), (double)atomic_load(&num_watchpoints_imprecise_address), (double)atomic_load(&num_watchpoints_dropped)); + AMSG("WATCHPOINT ANOMALIES: samples:%.2e, SM_imprecise:%.2e, WP_Set:%.2e, WP_triggered:%.2e, WP_SampleTriggering:%.2e, WP_ImpreciseIP:%.2e, WP_InsaneIP:%.2e, WP_Off8Addr:%.2e, WP_ImpreciseAddr:%.2e, WP_Dropped:%.2e, CORRECTED_REUSE_DISTANCE:%.2e", (double)atomic_load(&num_samples_total), (double)atomic_load(&num_samples_imprecise), (double)atomic_load(&num_watchpoints_set), (double)atomic_load(&num_watchpoints_triggered), (double)atomic_load(&num_sample_triggering_watchpoints), (double)atomic_load(&num_watchpoints_imprecise), (double)atomic_load(&num_insane_ip), (double)atomic_load(&num_watchpoints_imprecise_address_8_byte), (double)atomic_load(&num_watchpoints_imprecise_address), (double)atomic_load(&num_watchpoints_dropped), (double)atomic_load(&num_corrected_reuse_distance)); AMSG("WATCHPOINT STATS: writtenBytes:%ld, usedBytes:%ld, deadBytes:%ld, newBytes:%ld, oldBytes:%ld, oldAppxBytes:%ld, loadedBytes:%ld, accessedIns:%ld, falseWWIns:%ld, falseRWIns:%ld, falseWRIns:%ld, trueWWIns:%ld, trueRWIns:%ld, trueWRIns:%ld, RSS:%ld, reuseTemporal:%ld, reuseSpatial:%ldlatency:%ld", num_writtenBytes, num_usedBytes, num_deadBytes, num_newBytes, num_oldBytes, num_oldAppxBytes, num_loadedBytes, num_accessedIns, num_falseWWIns, num_falseRWIns, num_falseWRIns, num_trueWWIns, num_trueRWIns, num_trueWRIns, (size_t)(rusage.ru_maxrss), num_reuseTemporal, num_reuseSpatial, num_latency); diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index c2a1f5adc9..036848a834 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -300,6 +300,18 @@ perf_stop_all(int nevents, event_thread_t *event_thread) } } +/* + * 
Reset a counter value by the mapped file descriptor + */ +static void +perf_reset_counter(int fd) +{ + int ret = ioctl(fd, PERF_EVENT_IOC_RESET, 0); + if (ret == -1){ + EMSG("Can't reset event with fd: %d: %s", fd, strerror(errno)); + } +} + static int perf_get_pmu_support(const char *name, struct perf_event_attr *event_attr) { @@ -1131,6 +1143,7 @@ int linux_perf_read_event_counter(int event_index, uint64_t *val){ event_thread_t *current = &(event_thread[event_index]); int ret = perf_read_event_counter(current, val); + if (ret < 0) return -1; // something wrong here uint64_t sample_period = current->event->attr.sample_period; @@ -1140,10 +1153,14 @@ int linux_perf_read_event_counter(int event_index, uint64_t *val){ // overflow event assert(val[1] == val[2]); //jqswang: TODO: I have no idea how to calculate the value under multiplexing for overflow event. int64_t scaled_val = (int64_t) val[0] ;//% sample_period; - if (scaled_val >= sample_period || scaled_val < 0){ //jqswang: TODO: it does not filter out all the invalid values - scaled_val = 0; + if (scaled_val >= sample_period * 10 // The counter value can become larger than the sampling period but they are usually less than 2 * sample_period + || scaled_val < 0){ + //jqswang: TODO: it does not filter out all the invalid values + //fprintf(stderr, "WEIRD_COUNTER: %ld %s\n", scaled_val, current->event->metric_desc->name); + hpcrun_stats_num_corrected_reuse_distance_inc(1); + scaled_val = 0; } - //fprintf(stderr, "%s: %lu, %lu %lu %lu ->", current->event->metric_desc->name, current->num_overflows, val[0],val[1],val[2]); + //fprintf(stderr, "%s: %lu, %lu(%ld) %lu %lu ->", current->event->metric_desc->name, current->num_overflows, val[0],val[0],val[1],val[2]); val[0] = current->num_overflows * sample_period + scaled_val; //fprintf(stderr, " %lu\n", val[0]); val[1] = 0; @@ -1278,6 +1295,8 @@ perf_event_handler( kernel_block_handler(current, sv, &mmap_data); } while (more_data); + perf_reset_counter(fd); + 
perf_start_all(nevents, event_thread); hpcrun_safe_exit(); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 8edcbe0a5a..22e9121a99 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -672,7 +672,7 @@ static void ClientTermination(){ WriteWitchTraceOutput("FINAL_COUNTING:"); for (int i=0; i < MIN(2,reuse_distance_num_events); i++){ - linux_perf_read_event_counter(reuse_distance_events[i], val); + assert(linux_perf_read_event_counter(reuse_distance_events[i], val) >= 0); //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang WriteWitchTraceOutput(" %lu %lu %lu,", val[0], val[1], val[2]); } @@ -1612,7 +1612,7 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t val[2][3]; for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ - linux_perf_read_event_counter( reuse_distance_events[i], val[i]); + assert(linux_perf_read_event_counter( reuse_distance_events[i], val[i]) >= 0); //fprintf(stderr, "USE: %lu %lu %lu, REUSE: %lu %lu %lu\n", wpi->sample.reuseDistance[i][0], wpi->sample.reuseDistance[i][1], wpi->sample.reuseDistance[i][2], val[i][0], val[i][1], val[i][2]); //fprintf(stderr, "DIFF: %lu\n", val[i][0] - wpi->sample.reuseDistance[i][0]); for(int j=0; j < 3; j++){ @@ -2705,7 +2705,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, // We assume the reading event is load, store or both. 
for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ uint64_t val[3]; - linux_perf_read_event_counter( reuse_distance_events[i], val); + assert(linux_perf_read_event_counter( reuse_distance_events[i], val) >= 0); //fprintf(stderr, "USE %lu %lu %lu -- ", val[0], val[1], val[2]); //fprintf(stderr, "USE %lx -- ", val[0]); memcpy(sd.reuseDistance[i], val, sizeof(uint64_t)*3);; From e59904f312ee567084594e474416d5e08952ef0c Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Fri, 10 Aug 2018 21:30:34 -0700 Subject: [PATCH 41/43] Cleaned some code --- .../sample-sources/watchpoint_clients.c | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 22e9121a99..9332309214 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -1661,28 +1661,10 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); } -#if 0 //jqswang: currently disable the value borrowing process - uint64_t obtained_val[2]; - for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ - uint64_t * buffer_ptr = (uint64_t *) get_metric_data_ptr(reuse_buffer_metric_ids[i], reusePairNode); - if (val[i][2] == 0){ - //need to borrow value - obtained_val[i] = *buffer_ptr; - } else { - obtained_val[i] = perf_scale(val[i]); - *buffer_ptr = obtained_val[i]; - } - } - if ( obtained_val[0] > 0 && obtained_val[1] > 0) //attribute the value - { - cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (obtained_val[0] + obtained_val[1]) }); - cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); - } -#else - 
cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (val[0][0] + val[1][0]) }); - cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); -#endif + cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (val[0][0] + val[1][0]) }); + cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); + reuseTemporal += inc; cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); cct_metric_data_increment(reuse_time_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = time_distance}); From ef41f34733a7e2328c4542d8a0e0473c2ec08ae3 Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Sat, 11 Aug 2018 01:21:54 -0400 Subject: [PATCH 42/43] Start to add spatial reuse --- .../sample-sources/watchpoint_clients.c | 26 +++++++++++++++++-- .../sample-sources/watchpoint_support.h | 2 +- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index 22e9121a99..dfaa5e499d 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -1025,6 +1025,25 @@ METHOD_FN(process_event_list, int lush_metrics) } #else + { + char * monitor_type_str = getenv("HPCRUN_WP_REUSE_TYPE"); + if(monitor_type_str){ + if(0 == strcasecmp(monitor_type_str, "TEMPORAL")) { + reuse_type = REUSE_TEMPORAL; + } else if (0 == strcasecmp(monitor_type_str, "SPATIAL")) { + reuse_type = REUSE_SPATIAL; + } else if ( 0 == strcasecmp(monitor_type_str, "ALL") ) { + reuse_type = REUSE_BOTH; + } else { + // default; + reuse_type = REUSE_BOTH; + } + } else{ + // default + reuse_type = REUSE_BOTH; + } + } + { char * monitor_type_str = getenv("HPCRUN_WP_REUSE_MONITOR_TYPE"); if(monitor_type_str){ @@ -1074,10 +1093,8 @@ 
METHOD_FN(process_event_list, int lush_metrics) temporal_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); - #if 0 spatial_reuse_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(spatial_reuse_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); - #endif reuse_memory_distance_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(reuse_memory_distance_metric_id, "MEMORY_DISTANCE_SUM", MetricFlags_ValFmt_Int, 1, metric_property_none); reuse_memory_distance_count_metric_id = hpcrun_new_metric(); @@ -1187,6 +1204,7 @@ enum JoinNodeType { E_TEMPORALLY_REUSED_FROM, E_TEMPORALLY_REUSED_BY, E_SPATIALLY_REUSED_FROM, + E_SPATIALLY_REUSED_BY, E_TRUE_WW_SHARE, E_TRUE_WR_SHARE, E_TRUE_RW_SHARE, @@ -1225,6 +1243,9 @@ static void TEMPORALLY_REUSED_BY_INACCURATE_PC(void) {} static void SPATIALLY_REUSED_FROM(void) {} static void SPATIALLY_REUSED_FROM_INACCURATE_PC(void) {} +static void SPATIALLY_REUSED_BY(void) {} +static void SPATIALLY_REUSED_BY_INACCURATE_PC(void) {} + static void TRUE_WW_SHARE(void) {} static void TRUE_WW_SHARE_INACCURATE_PC(void) {} @@ -1271,6 +1292,7 @@ static const void * joinNodes[][2] = { [E_TEMPORALLY_REUSED_FROM] = GET_FUN_ADDR(TEMPORALLY_REUSED_FROM), [E_TEMPORALLY_REUSED_BY] = GET_FUN_ADDR(TEMPORALLY_REUSED_BY), [E_SPATIALLY_REUSED_FROM] = GET_FUN_ADDR(SPATIALLY_REUSED_FROM), + [E_SPATIALLY_REUSED_BY] = GET_FUN_ADDR(SPATIALLY_REUSED_BY), [E_TRUE_WW_SHARE] = GET_FUN_ADDR(TRUE_WW_SHARE), [E_TRUE_WR_SHARE] = GET_FUN_ADDR(TRUE_WR_SHARE), [E_TRUE_RW_SHARE] = GET_FUN_ADDR(TRUE_RW_SHARE), diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index ea762c84f1..bcd5a4c664 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -85,7 +85,7 @@ typedef enum MergePolicy 
{AUTO_MERGE, NO_MERGE, CLIENT_ACTION} MergePolicy; typedef enum OverwritePolicy {OVERWRITE, NO_OVERWRITE} OverwritePolicy; typedef enum VictimType {EMPTY_SLOT, NON_EMPTY_SLOT, NONE_AVAILABLE} VictimType; typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; -typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL} ReuseType; // for reuse client +typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL, RUESE_BOTH} ReuseType; // for reuse client // Data structure that is given by clients to set a WP typedef struct SampleData{ From 4dfee897716b00fe3f25ab8236817481a950c91f Mon Sep 17 00:00:00 2001 From: bugubugu123 Date: Thu, 13 Jun 2019 14:46:03 -0400 Subject: [PATCH 43/43] Added latency metrics of different cache levels and more configurable options --- .../hpcrun/sample-sources/perf/linux_perf.c | 38 ++++++-- .../sample-sources/watchpoint_clients.c | 91 ++++++++++++------- .../sample-sources/watchpoint_support.h | 2 +- 3 files changed, 89 insertions(+), 42 deletions(-) diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 036848a834..81dfc5cd9a 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -604,15 +604,29 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, if ( strstr(current->event->metric_desc->name, "LATENCY_ABOVE_THRESHOLD") || strstr(current->event->metric_desc->name, "LOAD_LATENCY") ) { perf_mmap_data_src_t data_src; data_src.val = mmap_data->data_src; - if ( (data_src.mem_lvl & PERF_MEM_LVL_HIT) && (data_src.mem_lvl & PERF_MEM_LVL_L1)){ // L1 HIT, ignore - *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t){.r=counter}, 0/*skipInner*/, 0/*isSync*/, NULL); - } - else { + + assert( (data_src.mem_lvl & PERF_MEM_LVL_MISS) == 0); // jqswang: Have not met PERF_MEM_LVL_MISS before. Notify me if there is one. 
+ + if ( (data_src.mem_lvl & PERF_MEM_LVL_HIT) && ( (data_src.mem_lvl & PERF_MEM_LVL_L1) == 0) ){ // L1 MISS *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t) {.r=counter}, 0/*skipInner*/, 0/*isSync*/, &info); extern int latency_metric_id; cct_metric_data_increment(latency_metric_id, sv->sample_node, (cct_metric_data_t){.i = mmap_data->weight}); - extern int latency_miss_load_metric_id; - cct_metric_data_increment(latency_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + extern int latency_l1_miss_load_metric_id; + cct_metric_data_increment(latency_l1_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + + if ( (data_src.mem_lvl & PERF_MEM_LVL_LFB) == 0) { + if ( (data_src.mem_lvl & PERF_MEM_LVL_L2) == 0) { // L2 miss + extern int latency_l2_miss_load_metric_id; + cct_metric_data_increment(latency_l2_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + + if ( (data_src.mem_lvl & PERF_MEM_LVL_L3) == 0) { // L3 miss + extern int latency_l3_miss_load_metric_id; + cct_metric_data_increment(latency_l3_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + } + } + } + } else { // Otherwise (L1 HIT, Non_Available) + *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t){.r=counter}, 0/*skipInner*/, 0/*isSync*/, &info); } } else { @@ -1011,9 +1025,15 @@ METHOD_FN(process_event_list, int lush_metrics) extern int latency_metric_id; latency_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(latency_metric_id, "LATENCY", MetricFlags_ValFmt_Int, threshold, metric_property_none); - extern int latency_miss_load_metric_id; - latency_miss_load_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(latency_miss_load_metric_id, "CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_l1_miss_load_metric_id; + latency_l1_miss_load_metric_id = hpcrun_new_metric(); + 
hpcrun_set_metric_info_and_period(latency_l1_miss_load_metric_id, "L1_CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_l2_miss_load_metric_id; + latency_l2_miss_load_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_l2_miss_load_metric_id, "L2_CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_l3_miss_load_metric_id; + latency_l3_miss_load_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_l3_miss_load_metric_id, "L3_CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); } if (m == NULL) { diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index a53c7a34c7..be0835a027 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -142,7 +142,9 @@ int load_metric_id = -1; int dead_metric_id = -1; int measured_metric_id = -1; int latency_metric_id = -1; -int latency_miss_load_metric_id = -1; +int latency_l1_miss_load_metric_id = -1; +int latency_l2_miss_load_metric_id = -1; +int latency_l3_miss_load_metric_id = -1; int temporal_reuse_metric_id = -1; int spatial_reuse_metric_id = -1; @@ -173,6 +175,7 @@ int reuse_bin_size = 0; #else AccessType reuse_monitor_type = LOAD_AND_STORE; // WP_REUSE: what kind of memory access can be used to subscribe the watchpoint WatchPointType reuse_trap_type = WP_RW; // WP_REUSE: what kind of memory access can trap the watchpoint +ReuseType reuse_profile_type = REUSE_BOTH; // WP_REUSE: we want to collect temporal reuse, spatial reuse OR both? 
bool reuse_concatenate_use_reuse = false; // WP_REUSE: how to concatentate the use and reuse #endif @@ -665,6 +668,7 @@ static void ClientTermination(){ if (reuse_output_trace == false){ //dump the bin info WriteWitchTraceOutput("BIN_START: %lf\n", reuse_bin_start); WriteWitchTraceOutput("BIN_RATIO: %lf\n", reuse_bin_ratio); + for(int i=0; i < reuse_bin_size; i++){ WriteWitchTraceOutput("BIN: %d %lu\n", i, reuse_bin_list[i]); } @@ -683,7 +687,6 @@ static void ClientTermination(){ #endif hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); - hpcrun_stats_num_accessedIns_inc(accessedIns); hpcrun_stats_num_reuseSpatial_inc(reuseSpatial); } break; case WP_FALSE_SHARING: @@ -1026,21 +1029,21 @@ METHOD_FN(process_event_list, int lush_metrics) #else { - char * monitor_type_str = getenv("HPCRUN_WP_REUSE_TYPE"); + char * monitor_type_str = getenv("HPCRUN_WP_REUSE_PROFILE_TYPE"); if(monitor_type_str){ if(0 == strcasecmp(monitor_type_str, "TEMPORAL")) { - reuse_type = REUSE_TEMPORAL; + reuse_profile_type = REUSE_TEMPORAL; } else if (0 == strcasecmp(monitor_type_str, "SPATIAL")) { - reuse_type = REUSE_SPATIAL; + reuse_profile_type = REUSE_SPATIAL; } else if ( 0 == strcasecmp(monitor_type_str, "ALL") ) { - reuse_type = REUSE_BOTH; + reuse_profile_type = REUSE_BOTH; } else { // default; - reuse_type = REUSE_BOTH; + reuse_profile_type = REUSE_BOTH; } } else{ // default - reuse_type = REUSE_BOTH; + reuse_profile_type = REUSE_BOTH; } } @@ -1655,11 +1658,11 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse uint64_t time_distance = rdtsc() - wpi->startTime; +#ifdef REUSE_HISTO //cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); cct_node_t *reuseNode = v.sample_node; -#ifdef REUSE_HISTO if (reuse_output_trace){ 
WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); for(int i=0; i < MIN(2, reuse_distance_num_events); i++){ @@ -1674,21 +1677,38 @@ static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffse } ReuseAddDistance(rd, inc); } + #else cct_node_t *reusePairNode; - if (reuse_concatenate_use_reuse){ - reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); - }else{ - reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + if (wpi->sample.reuseType == REUSE_TEMPORAL){ + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *reuseNode = v.sample_node; + if (reuse_concatenate_use_reuse){ + reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); + }else{ + reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + } + } + else { // REUSE_SPATIAL + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, spatial_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *reuseNode = v.sample_node; + if (reuse_concatenate_use_reuse){ + reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, joinNodes[E_SPATIALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); + }else{ + reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + } } - cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (val[0][0] + 
val[1][0]) }); cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); reuseTemporal += inc; - cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); + if (wpi->sample.reuseType == REUSE_TEMPORAL){ + cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); + } else { + cct_metric_data_increment(spatial_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); + } cct_metric_data_increment(reuse_time_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = time_distance}); cct_metric_data_increment(reuse_time_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); #endif @@ -2675,32 +2695,39 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, sd.type = reuse_trap_type; #endif - -#if 0 //spatial reuse.. currently we don't need it - if (rdtsc() & 1)// 50% chance to detect spatial reuse - { + bool isProfileSpatial; + if (reuse_profile_type == REUSE_TEMPORAL){ + isProfileSpatial = false; + } else if (reuse_profile_type == REUSE_SPATIAL){ + isProfileSpatial = true; + } else { + isProfileSpatial = (rdtsc() & 1); + } + if (isProfileSpatial) {// detect spatial reuse int wpSizes[] = {8, 4, 2, 1}; FalseSharingLocs falseSharingLocs[CACHE_LINE_SZ]; int numFSLocs = 0; GetAllFalseSharingLocations((size_t)data_addr, accessLen, ALIGN_TO_CACHE_LINE((size_t)(data_addr)), CACHE_LINE_SZ, wpSizes, 0 /*curWPSizeIdx*/ , 4 /*totalWPSizes*/, falseSharingLocs, &numFSLocs); - assert(numFSLocs > 0); // at least there is one location to monitor - int idx = rdtsc() % numFSLocs; //randomly choose one location to monitor - sd.va = (void *)falseSharingLocs[idx].va; - sd.reuseType = REUSE_SPATIAL; + if (numFSLocs == 0) { // No location is found. It is probably due to the access length already occupies one cache line. So we just monitor the temporal reuse instead. 
+ sd.va = data_addr; + sd.reuseType = REUSE_TEMPORAL; + } else { + int idx = rdtsc() % numFSLocs; //randomly choose one location to monitor + sd.va = (void *)falseSharingLocs[idx].va; + sd.reuseType = REUSE_SPATIAL; #if 0 - int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; - int bound = CACHE_LINE_SZ / accessLen; - int r = rdtsc() % bound; - if (r == offset) r = (r+1) % bound; - sd.va = aligned_pc + (r * accessLen); + int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; + int bound = CACHE_LINE_SZ / accessLen; + int r = rdtsc() % bound; + if (r == offset) r = (r+1) % bound; + sd.va = aligned_pc + (r * accessLen); #endif - } - else -#endif - { + } + } else { sd.va = data_addr; sd.reuseType = REUSE_TEMPORAL; } + if (!IsValidAddress(sd.va, precisePC)) { goto ErrExit; // incorrect access type } diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index bcd5a4c664..233502a103 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -85,7 +85,7 @@ typedef enum MergePolicy {AUTO_MERGE, NO_MERGE, CLIENT_ACTION} MergePolicy; typedef enum OverwritePolicy {OVERWRITE, NO_OVERWRITE} OverwritePolicy; typedef enum VictimType {EMPTY_SLOT, NON_EMPTY_SLOT, NONE_AVAILABLE} VictimType; typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; -typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL, RUESE_BOTH} ReuseType; // for reuse client +typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL, REUSE_BOTH} ReuseType; // for reuse client // Data structure that is given by clients to set a WP typedef struct SampleData{