diff --git a/src/tool/hpcrun/hpcrun_stats.c b/src/tool/hpcrun/hpcrun_stats.c index f764c50ecc..3b21f34570 100644 --- a/src/tool/hpcrun/hpcrun_stats.c +++ b/src/tool/hpcrun/hpcrun_stats.c @@ -102,8 +102,10 @@ static atomic_long num_falseWWIns = ATOMIC_VAR_INIT(0); static atomic_long num_falseRWIns = ATOMIC_VAR_INIT(0); static atomic_long num_falseWRIns = ATOMIC_VAR_INIT(0); -static atomic_long num_reuse = ATOMIC_VAR_INIT(0); +static atomic_long num_reuseSpatial = ATOMIC_VAR_INIT(0); +static atomic_long num_reuseTemporal = ATOMIC_VAR_INIT(0); static atomic_long num_latency = ATOMIC_VAR_INIT(0); +static atomic_long num_corrected_reuse_distance = ATOMIC_VAR_INIT(0); static atomic_long num_unwind_intervals_total = ATOMIC_VAR_INIT(0); static atomic_long num_unwind_intervals_suspicious = ATOMIC_VAR_INIT(0); @@ -155,6 +157,11 @@ hpcrun_stats_reinit(void) atomic_store_explicit(&num_trueWWIns, 0, memory_order_relaxed); atomic_store_explicit(&num_trueRWIns, 0, memory_order_relaxed); atomic_store_explicit(&num_trueWRIns, 0, memory_order_relaxed); + + atomic_store_explicit(&num_reuseSpatial, 0, memory_order_relaxed); + atomic_store_explicit(&num_reuseTemporal, 0, memory_order_relaxed); + atomic_store_explicit(&num_latency, 0, memory_order_relaxed); + atomic_store_explicit(&num_corrected_reuse_distance, 0, memory_order_relaxed); } @@ -274,95 +281,107 @@ void hpcrun_stats_num_insane_ip_inc(long val) { atomic_fetch_add_explicit(&num_insane_ip, val, memory_order_relaxed); -} - - -long +} + + +long hpcrun_stats_num_insane_ip(void) -{ +{ return atomic_load_explicit(&num_insane_ip, memory_order_relaxed); -} +} void hpcrun_stats_num_writtenBytes_inc(long val) { atomic_fetch_add_explicit(&num_writtenBytes, val, memory_order_relaxed); -} - +} + void hpcrun_stats_num_usedBytes_inc(long val) { atomic_fetch_add_explicit(&num_usedBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_deadBytes_inc(long val) { atomic_fetch_add_explicit(&num_deadBytes, val, memory_order_relaxed); -} +} 
void hpcrun_stats_num_newBytes_inc(long val) { atomic_fetch_add_explicit(&num_newBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_oldAppxBytes_inc(long val) { atomic_fetch_add_explicit(&num_oldAppxBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_oldBytes_inc(long val) { atomic_fetch_add_explicit(&num_oldBytes, val, memory_order_relaxed); -} - +} + void hpcrun_stats_num_loadedBytes_inc(long val) { atomic_fetch_add_explicit(&num_loadedBytes, val, memory_order_relaxed); -} +} void hpcrun_stats_num_accessedIns_inc(long val) { atomic_fetch_add_explicit(&num_accessedIns, val, memory_order_relaxed); -} +} + +void +hpcrun_stats_num_reuseTemporal_inc(long val) +{ + atomic_fetch_add_explicit(&num_reuseTemporal, val, memory_order_relaxed); +} void -hpcrun_stats_num_reuse_inc(long val) +hpcrun_stats_num_reuseSpatial_inc(long val) { - atomic_fetch_add_explicit(&num_reuse, val, memory_order_relaxed); -} + atomic_fetch_add_explicit(&num_reuseSpatial, val, memory_order_relaxed); +} void hpcrun_stats_num_latency_inc(long val) { atomic_fetch_add_explicit(&num_latency, val, memory_order_relaxed); -} +} + +void +hpcrun_stats_num_corrected_reuse_distance_inc(long val) +{ + atomic_fetch_add_explicit(&num_corrected_reuse_distance, val, memory_order_relaxed); +} void hpcrun_stats_num_falseWWIns_inc(long val) { atomic_fetch_add_explicit(&num_falseWWIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_falseRWIns_inc(long val) { atomic_fetch_add_explicit(&num_falseRWIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_falseWRIns_inc(long val) { atomic_fetch_add_explicit(&num_falseWRIns, val, memory_order_relaxed); -} +} void hpcrun_stats_num_trueWWIns_inc(long val) @@ -384,7 +403,7 @@ hpcrun_stats_num_trueWRIns_inc(long val) //----------------------------- -// samples total +// samples total //----------------------------- void @@ -403,7 +422,7 @@ hpcrun_stats_num_samples_total(void) //----------------------------- -// samples attempted +// samples 
attempted //----------------------------- void @@ -422,7 +441,7 @@ hpcrun_stats_num_samples_attempted(void) //----------------------------- -// samples blocked async +// samples blocked async //----------------------------- // The async blocks happen in the signal handlers, without getting to @@ -444,7 +463,7 @@ hpcrun_stats_num_samples_blocked_async(void) //----------------------------- -// samples blocked dlopen +// samples blocked dlopen //----------------------------- void @@ -617,7 +636,6 @@ hpcrun_stats_num_samples_yielded(void) //----------------------------- // print summary //----------------------------- - void hpcrun_stats_print_summary(void) { @@ -637,9 +655,9 @@ hpcrun_stats_print_summary(void) getrusage(RUSAGE_SELF, &rusage); //AMSG("WATCHPOINT ANOMALIES: samples:%ld, SM_imprecise:%ld, WP_Set:%ld, WP_triggered:%ld, WP_SampleTriggering:%ld, WP_ImpreciseIP:%ld, WP_InsaneIP:%ld, WP_Off8Addr:%ld, WP_ImpreciseAddr:%ld, WP_Dropped:%ld", num_samples_total, num_samples_imprecise, num_watchpoints_set, num_watchpoints_triggered, num_sample_triggering_watchpoints, num_watchpoints_imprecise, num_insane_ip, num_watchpoints_imprecise_address_8_byte, num_watchpoints_imprecise_address, num_watchpoints_dropped); - AMSG("WATCHPOINT ANOMALIES: samples:%.2e, SM_imprecise:%.2e, WP_Set:%.2e, WP_triggered:%.2e, WP_SampleTriggering:%.2e, WP_ImpreciseIP:%.2e, WP_InsaneIP:%.2e, WP_Off8Addr:%.2e, WP_ImpreciseAddr:%.2e, WP_Dropped:%.2e", (double)atomic_load(&num_samples_total), (double)atomic_load(&num_samples_imprecise), (double)atomic_load(&num_watchpoints_set), (double)atomic_load(&num_watchpoints_triggered), (double)atomic_load(&num_sample_triggering_watchpoints), (double)atomic_load(&num_watchpoints_imprecise), (double)atomic_load(&num_insane_ip), (double)atomic_load(&num_watchpoints_imprecise_address_8_byte), (double)atomic_load(&num_watchpoints_imprecise_address), (double)atomic_load(&num_watchpoints_dropped)); + AMSG("WATCHPOINT ANOMALIES: samples:%.2e, 
SM_imprecise:%.2e, WP_Set:%.2e, WP_triggered:%.2e, WP_SampleTriggering:%.2e, WP_ImpreciseIP:%.2e, WP_InsaneIP:%.2e, WP_Off8Addr:%.2e, WP_ImpreciseAddr:%.2e, WP_Dropped:%.2e, CORRECTED_REUSE_DISTANCE:%.2e", (double)atomic_load(&num_samples_total), (double)atomic_load(&num_samples_imprecise), (double)atomic_load(&num_watchpoints_set), (double)atomic_load(&num_watchpoints_triggered), (double)atomic_load(&num_sample_triggering_watchpoints), (double)atomic_load(&num_watchpoints_imprecise), (double)atomic_load(&num_insane_ip), (double)atomic_load(&num_watchpoints_imprecise_address_8_byte), (double)atomic_load(&num_watchpoints_imprecise_address), (double)atomic_load(&num_watchpoints_dropped), (double)atomic_load(&num_corrected_reuse_distance)); - AMSG("WATCHPOINT STATS: writtenBytes:%ld, usedBytes:%ld, deadBytes:%ld, newBytes:%ld, oldBytes:%ld, oldAppxBytes:%ld, loadedBytes:%ld, accessedIns:%ld, falseWWIns:%ld, falseRWIns:%ld, falseWRIns:%ld, trueWWIns:%ld, trueRWIns:%ld, trueWRIns:%ld, RSS:%ld, reuse:%ld, latency:%ld", num_writtenBytes, num_usedBytes, num_deadBytes, num_newBytes, num_oldBytes, num_oldAppxBytes, num_loadedBytes, num_accessedIns, num_falseWWIns, num_falseRWIns, num_falseWRIns, num_trueWWIns, num_trueRWIns, num_trueWRIns, (size_t)(rusage.ru_maxrss), num_reuse, num_latency); + AMSG("WATCHPOINT STATS: writtenBytes:%ld, usedBytes:%ld, deadBytes:%ld, newBytes:%ld, oldBytes:%ld, oldAppxBytes:%ld, loadedBytes:%ld, accessedIns:%ld, falseWWIns:%ld, falseRWIns:%ld, falseWRIns:%ld, trueWWIns:%ld, trueRWIns:%ld, trueWRIns:%ld, RSS:%ld, reuseTemporal:%ld, reuseSpatial:%ld, latency:%ld", num_writtenBytes, num_usedBytes, num_deadBytes, num_newBytes, num_oldBytes, num_oldAppxBytes, num_loadedBytes, num_accessedIns, num_falseWWIns, num_falseRWIns, num_falseWRIns, num_trueWWIns, num_trueRWIns, num_trueWRIns, (size_t)(rusage.ru_maxrss), num_reuseTemporal, num_reuseSpatial, num_latency); AMSG("SAMPLE ANOMALIES: blocks: %ld (async: %ld, dlopen: %ld), " "errors: %ld (segv: %ld, 
soft: %ld)", diff --git a/src/tool/hpcrun/hpcrun_stats.h b/src/tool/hpcrun/hpcrun_stats.h index 67452f1da7..cf009f05d0 100644 --- a/src/tool/hpcrun/hpcrun_stats.h +++ b/src/tool/hpcrun/hpcrun_stats.h @@ -52,7 +52,7 @@ void hpcrun_stats_reinit(void); //----------------------------- -// watchpoint +// watchpoint //----------------------------- void hpcrun_stats_num_samples_imprecise_inc(long val); long hpcrun_stats_num_samples_imprecise(void); @@ -85,12 +85,13 @@ void hpcrun_stats_num_deadBytes_inc(long val); void hpcrun_stats_num_newBytes_inc(long val); void hpcrun_stats_num_oldBytes_inc(long val); void hpcrun_stats_num_oldAppxBytes_inc(long val); -void hpcrun_stats_num_reuse_inc(long val); +void hpcrun_stats_num_reuseTemporal_inc(long val); +void hpcrun_stats_num_reuseSpatial_inc(long val); void hpcrun_stats_num_loadedBytes_inc(long val); //----------------------------- -// samples total +// samples total //----------------------------- void hpcrun_stats_num_samples_total_inc(void); @@ -98,7 +99,7 @@ long hpcrun_stats_num_samples_total(void); //----------------------------- -// samples attempted +// samples attempted //----------------------------- void hpcrun_stats_num_samples_attempted_inc(void); @@ -106,7 +107,7 @@ long hpcrun_stats_num_samples_attempted(void); //----------------------------- -// samples blocked async +// samples blocked async //----------------------------- void hpcrun_stats_num_samples_blocked_async_inc(void); @@ -114,7 +115,7 @@ long hpcrun_stats_num_samples_blocked_async(void); //----------------------------- -// samples blocked dlopen +// samples blocked dlopen //----------------------------- void hpcrun_stats_num_samples_blocked_dlopen_inc(void); diff --git a/src/tool/hpcrun/metrics.c b/src/tool/hpcrun/metrics.c index 92ea033171..cb1bd54180 100644 --- a/src/tool/hpcrun/metrics.c +++ b/src/tool/hpcrun/metrics.c @@ -211,7 +211,7 @@ hpcrun_get_num_metrics() id2metric[l->id] = &(l->val); } metric_proc_tbl = (metric_upd_proc_t**) 
hpcrun_malloc(n_metrics * sizeof(metric_upd_proc_t*)); - + for(metric_proc_map_t* l = proc_map; l; l = l->next) { // for(metric_proc_map_t* l = proc_map; l; l = l->next) { TMSG(METRICS_FINALIZE, "metric_proc[%d] = %p", l->id, l->proc); @@ -233,7 +233,7 @@ hpcrun_get_num_metrics() // Finalize metrics -void hpcrun_finalize_metrics() +void hpcrun_finalize_metrics() { hpcrun_get_num_metrics(); } @@ -241,7 +241,7 @@ void hpcrun_finalize_metrics() metric_desc_t* hpcrun_id2metric(int metric_id) { - hpcrun_get_num_metrics(); + hpcrun_get_num_metrics(); if ((0 <= metric_id) && (metric_id < n_metrics)) { return id2metric[metric_id]; } @@ -307,7 +307,7 @@ hpcrun_new_metric_of_kind(kind_info_t* kind) kind->idx++; n_metrics++; - + // // No preallocation for metric_proc tbl // @@ -316,7 +316,7 @@ hpcrun_new_metric_of_kind(kind_info_t* kind) m->id = metric_data->id; m->proc = (metric_upd_proc_t*) NULL; proc_map = m; - + return metric_data->id; } @@ -326,7 +326,7 @@ hpcrun_new_metric(void) return hpcrun_new_metric_of_kind(current_kind); } -metric_desc_t* +metric_desc_t* hpcrun_set_metric_info_w_fn(int metric_id, const char* name, MetricFlags_ValFmt_t valFmt, size_t period, metric_upd_proc_t upd_fn, metric_desc_properties_t prop) @@ -377,7 +377,7 @@ hpcrun_set_metric_info_w_fn(int metric_id, const char* name, } -metric_desc_t* +metric_desc_t* hpcrun_set_metric_info_and_period(int metric_id, const char* name, MetricFlags_ValFmt_t valFmt, size_t period, metric_desc_properties_t prop) { @@ -454,7 +454,7 @@ hpcrun_metric_std(int metric_id, metric_set_t* set, switch (minfo->flags.fields.valFmt) { case MetricFlags_ValFmt_Int: if (operation == '+') - loc->i += val.i; + loc->i += val.i; else if (operation == '=') loc->i = val.i; break; @@ -479,13 +479,13 @@ hpcrun_metric_std_set(int metric_id, metric_set_t* set, } // -// Given two metrics, metric_id1 and metric_id2, -// bump up metric_id2 to reach metric_id1 and return +// Given two metrics, metric_id1 and metric_id2, +// bump up 
metric_id2 to reach metric_id1 and return // the difference between them multiplied by the period. // int -hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, - metric_set_t* set, cct_metric_data_t * diff, +hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, + metric_set_t* set, cct_metric_data_t * diff, cct_metric_data_t * diffWithPeriod) { metric_desc_t* minfo1 = hpcrun_id2metric(metric_id1); @@ -509,8 +509,12 @@ hpcrun_get_weighted_metric_diff(int metric_id1, int metric_id2, diff->i = (loc1->i - loc2->i); break; case MetricFlags_ValFmt_Real: - assert(loc1->r >= loc2->r); - diff->r = (loc1->r - loc2->r); + if (loc1->r < loc2->r){ + diff->r = 0; + } + else { + diff->r = (loc1->r - loc2->r); + } diffWithPeriod->r = (loc1->r - loc2->r) * minfo1->period; break; default: diff --git a/src/tool/hpcrun/sample-sources/perf/linux_perf.c b/src/tool/hpcrun/sample-sources/perf/linux_perf.c index 0569ac54cb..823d99af5c 100644 --- a/src/tool/hpcrun/sample-sources/perf/linux_perf.c +++ b/src/tool/hpcrun/sample-sources/perf/linux_perf.c @@ -62,7 +62,7 @@ #include #include -#include +#include #include #include #include @@ -90,7 +90,7 @@ #include "sample-sources/common.h" #include "sample-sources/watchpoint_support.h" #include "sample-sources/ss-errno.h" - + #include #include #include @@ -109,7 +109,7 @@ #include // prefix for metric helper #include // hostid -#include +#include #include "perfmon-util.h" @@ -135,7 +135,7 @@ // default number of samples per second per thread // // linux perf has a default of 4000. this seems high, but the overhead for perf -// is still small. however, for some processors (e.g., KNL), overhead +// is still small. however, for some processors (e.g., KNL), overhead // at such a high sampling rate is significant and as a result, the kernel // will adjust the threshold to less than 100. 
// @@ -178,19 +178,20 @@ struct event_threshold_s { }; //****************************************************************************** -// forward declarations +// forward declarations //****************************************************************************** -static bool +static bool perf_thread_init(event_info_t *event, event_thread_t *et); -static void +static void perf_thread_fini(int nevents, event_thread_t *event_thread); -static int +static int perf_event_handler( int sig, siginfo_t* siginfo, void* context); + //****************************************************************************** // constants //****************************************************************************** @@ -199,6 +200,12 @@ static const struct timespec nowait = {0, 0}; + +//****************************************************************************** +// global variables +//****************************************************************************** + + //****************************************************************************** // local variables //****************************************************************************** @@ -219,17 +226,34 @@ extern __thread bool hpcrun_thread_suppress_sample; //****************************************************************************** -// private operations +// private operations //****************************************************************************** -/* +// The array values consists of three elements. 
+// values[0]: raw counter value; values[1]: the time enabling; values[2]: the time running +static inline uint64_t perf_scale(uint64_t *values) { + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + EMSG("WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + EMSG("WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + +/* * determine whether the perf sample source has been finalized for this thread - */ -static int + */ +static int perf_was_finalized ( - int nevents, + int nevents, event_thread_t *event_thread ) { @@ -239,7 +263,7 @@ perf_was_finalized /* * Enable all the counters - */ + */ static void perf_start_all(int nevents, event_thread_t *event_thread) { @@ -247,9 +271,9 @@ perf_start_all(int nevents, event_thread_t *event_thread) for(i=0; inum_overflows = 0; et->event = event; // ask sys to "create" the event // it returns -1 if it fails. 
@@ -403,20 +440,20 @@ perf_thread_init(event_info_t *event, event_thread_t *et) " id: %d, fd: %d, skid: %d," " config: %d, type: %d, sample_freq: %d," " freq: %d, error: %s", - event->id, et->fd, event->attr.precise_ip, - event->attr.config, event->attr.type, event->attr.sample_freq, + event->id, et->fd, event->attr.precise_ip, + event->attr.config, event->attr.type, event->attr.sample_freq, event->attr.freq, strerror(errno)); return false; } - // create mmap buffer for this file + // create mmap buffer for this file et->mmap = set_mmap(et->fd); // make sure the file I/O is asynchronous int flag = fcntl(et->fd, F_GETFL, 0); int ret = fcntl(et->fd, F_SETFL, flag | O_ASYNC ); if (ret == -1) { - EMSG("Can't set notification for event %d, fd: %d: %s", + EMSG("Can't set notification for event %d, fd: %d: %s", event->id, et->fd, strerror(errno)); } @@ -434,13 +471,13 @@ perf_thread_init(event_info_t *event, event_thread_t *et) owner.pid = syscall(SYS_gettid); ret = fcntl(et->fd, F_SETOWN_EX, &owner); if (ret == -1) { - EMSG("Can't set thread owner for event %d, fd: %d: %s", + EMSG("Can't set thread owner for event %d, fd: %d: %s", event->id, et->fd, strerror(errno)); } ret = ioctl(et->fd, PERF_EVENT_IOC_RESET, 0); if (ret == -1) { - EMSG("Can't reset event %d, fd: %d: %s", + EMSG("Can't reset event %d, fd: %d: %s", event->id, et->fd, strerror(errno)); } return (ret >= 0); @@ -448,7 +485,7 @@ perf_thread_init(event_info_t *event, event_thread_t *et) //---------------------------------------------------------- -// actions when the program terminates: +// actions when the program terminates: // - unmap the memory // - close file descriptors used by each event //---------------------------------------------------------- @@ -473,7 +510,7 @@ perf_thread_fini(int nevents, event_thread_t *event_thread) event_thread[i].fd = PERF_FD_FINALIZED; } - if (event_thread[i].mmap) { + if (event_thread[i].mmap) { perf_unmmap(event_thread[i].mmap); event_thread[i].mmap = 0; } @@ -499,7 
+536,7 @@ get_fd_index(int nevents, int fd, event_thread_t *event_thread) if (event_thread[i].fd == fd) return &(event_thread[i]); } - return NULL; + return NULL; } static sample_val_t* @@ -571,11 +608,40 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, } else { td->precise_pc = 0; } - - *sv = hpcrun_sample_callpath(context, current->event->metric, - (hpcrun_metricVal_t) {.r=counter}, - 0/*skipInner*/, 0/*isSync*/, &info); + if ( strstr(current->event->metric_desc->name, "LATENCY_ABOVE_THRESHOLD") || strstr(current->event->metric_desc->name, "LOAD_LATENCY") ) { + perf_mmap_data_src_t data_src; + data_src.val = mmap_data->data_src; + + assert( (data_src.mem_lvl & PERF_MEM_LVL_MISS) == 0); // jqswang: Have not met PERF_MEM_LVL_MISS before. Notify me if there is one. + + if ( (data_src.mem_lvl & PERF_MEM_LVL_HIT) && ( (data_src.mem_lvl & PERF_MEM_LVL_L1) == 0) ){ // L1 MISS + *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t) {.r=counter}, 0/*skipInner*/, 0/*isSync*/, &info); + extern int latency_metric_id; + cct_metric_data_increment(latency_metric_id, sv->sample_node, (cct_metric_data_t){.i = mmap_data->weight}); + extern int latency_l1_miss_load_metric_id; + cct_metric_data_increment(latency_l1_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + + if ( (data_src.mem_lvl & PERF_MEM_LVL_LFB) == 0) { + if ( (data_src.mem_lvl & PERF_MEM_LVL_L2) == 0) { // L2 miss + extern int latency_l2_miss_load_metric_id; + cct_metric_data_increment(latency_l2_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + + if ( (data_src.mem_lvl & PERF_MEM_LVL_L3) == 0) { // L3 miss + extern int latency_l3_miss_load_metric_id; + cct_metric_data_increment(latency_l3_miss_load_metric_id, sv->sample_node, (cct_metric_data_t){.i = counter}); + } + } + } + } else { // Otherwise (L1 HIT, Non_Available) + *sv = hpcrun_sample_callpath(context, current->event->metric, (hpcrun_metricVal_t){.r=counter}, 
0/*skipInner*/, 0/*isSync*/, &info); + } + } + else { + *sv = hpcrun_sample_callpath(context, current->event->metric, + (hpcrun_metricVal_t) {.r=counter}, + 0/*skipInner*/, 0/*isSync*/, &info); + } // no need to reset the precise_pc; hpcrun_sample_callpath does so // td->precise_pc = 0; @@ -603,7 +669,7 @@ record_sample(event_thread_t *current, perf_mmap_data_t *mmap_data, sv->sample_node, current->event->metric); } - blame_shift_apply(current->event->metric, sv->sample_node, + blame_shift_apply(current->event->metric, sv->sample_node, counter /*metricIncr*/); return sv; @@ -730,10 +796,10 @@ METHOD_FN(thread_fini_action) { TMSG(LINUX_PERF, "%d: unregister thread", self->sel_idx); - METHOD_CALL(self, stop); // stop the sample source + METHOD_CALL(self, stop); // stop the sample source event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; - int nevents = (self->evl).nevents; + int nevents = (self->evl).nevents; perf_thread_fini(nevents, event_thread); @@ -783,10 +849,10 @@ METHOD_FN(shutdown) { TMSG(LINUX_PERF, "shutdown"); - METHOD_CALL(self, stop); // stop the sample source + METHOD_CALL(self, stop); // stop the sample source event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; - int nevents = (self->evl).nevents; + int nevents = (self->evl).nevents; perf_thread_fini(nevents, event_thread); @@ -831,7 +897,7 @@ METHOD_FN(supports_event, const char *ev_str) } - + // -------------------------------------------------------------------------- // handle a list of events // -------------------------------------------------------------------------- @@ -849,7 +915,7 @@ METHOD_FN(process_event_list, int lush_metrics) // TODO: stupid way to count the number of events for (event = start_tok(evlist); more_tok(); event = next_tok(), num_events++); - + // setup all requested events // if an event cannot be initialized, we still keep it in our list // but there will be no samples @@ -862,6 +928,16 @@ METHOD_FN(process_event_list, int lush_metrics) } 
memset(event_desc, 0, size); + extern int *reuse_distance_events; + extern int reuse_distance_num_events; + reuse_distance_events = (int *) hpcrun_malloc(sizeof(int) * num_events); + reuse_distance_num_events = 0; + if (reuse_distance_events == NULL){ + EMSG("Unable to allocate %d bytes", sizeof(int)*num_events); + return; + } + + int i=0; set_default_threshold(); @@ -913,7 +989,7 @@ METHOD_FN(process_event_list, int lush_metrics) // ------------------------------------------------------------ // initialize the property of the metric - // if the metric's name has "CYCLES" it mostly a cycle metric + // if the metric's name has "CYCLES" it mostly a cycle metric // this assumption is not true, but it's quite closed // ------------------------------------------------------------ @@ -928,19 +1004,48 @@ METHOD_FN(process_event_list, int lush_metrics) // since the OS will free it, we don't have to do it in hpcrun // set the metric for this perf event event_desc[i].metric = hpcrun_new_metric(); - + + + /******** For witch client WP_REUSE ***************/ +#ifdef REUSE_HISTO + if (strstr(name, "MEM_UOPS_RETIRED") != NULL) +#else + if (strstr(name, "MEM_UOPS_RETIRED") != NULL) //jqswang: TODO // && threshold == 0) +#endif + { + reuse_distance_events[reuse_distance_num_events++] = i; + } + /**************************************************/ + + // ------------------------------------------------------------ // if we use frequency (event_type=1) then the period is not deterministic, // it can change dynamically. 
In this case, the period is 1 // ------------------------------------------------------------ if (!is_period) { - // using frequency : the threshold is always 1, + // using frequency : the threshold is always 1, // since the period is determine dynamically threshold = 1; } metric_desc_t *m = hpcrun_set_metric_info_and_period(event_desc[i].metric, name_dup, MetricFlags_ValFmt_Real, threshold, prop); + // add the latency metric if the event is MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD or MEM_TRANS_RETIRED:LOAD_LATENCY + if (strstr(name, "MEM_TRANS_RETIRED")) { + extern int latency_metric_id; + latency_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_metric_id, "LATENCY", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_l1_miss_load_metric_id; + latency_l1_miss_load_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_l1_miss_load_metric_id, "L1_CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_l2_miss_load_metric_id; + latency_l2_miss_load_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_l2_miss_load_metric_id, "L2_CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); + extern int latency_l3_miss_load_metric_id; + latency_l3_miss_load_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(latency_l3_miss_load_metric_id, "L3_CACHE_MISS_LOAD", MetricFlags_ValFmt_Int, threshold, metric_property_none); + } + if (m == NULL) { EMSG("Error: unable to create metric #%d: %s", index, name); } else { @@ -1041,7 +1146,6 @@ read_fd(int fd) } - /*************************************************************************** * object ***************************************************************************/ @@ -1066,14 +1170,52 @@ void linux_perf_events_resume(){ perf_start_all(nevents, event_thread); } +// OUTPUT: val, it is a uint64_t array and has at least 3 elements. 
+// For a counting event, val[0] is the actual value read from counter; val[1] is the time enabling; val[2] is time running +// For an overflow event, val[0] is the actual scaled value; val[1] and val[2] are set to 0 +// RETURN: 0, success; -1, error +int linux_perf_read_event_counter(int event_index, uint64_t *val){ + sample_source_t *self = &obj_name(); + event_thread_t *event_thread = TD_GET(ss_info)[self->sel_idx].ptr; + + event_thread_t *current = &(event_thread[event_index]); + + int ret = perf_read_event_counter(current, val); + + if (ret < 0) return -1; // something wrong here + + uint64_t sample_period = current->event->attr.sample_period; + if (sample_period == 0){ // counting event + return 0; + } else { + // overflow event + assert(val[1] == val[2]); //jqswang: TODO: I have no idea how to calculate the value under multiplexing for overflow event. + int64_t scaled_val = (int64_t) val[0] ;//% sample_period; + if (scaled_val >= sample_period * 10 // The counter value can become larger than the sampling period but they are usually less than 2 * sample_period + || scaled_val < 0){ + //jqswang: TODO: it does not filter out all the invalid values + //fprintf(stderr, "WEIRD_COUNTER: %ld %s\n", scaled_val, current->event->metric_desc->name); + hpcrun_stats_num_corrected_reuse_distance_inc(1); + scaled_val = 0; + } + //fprintf(stderr, "%s: %lu, %lu(%ld) %lu %lu ->", current->event->metric_desc->name, current->num_overflows, val[0],val[0],val[1],val[2]); + val[0] = current->num_overflows * sample_period + scaled_val; + //fprintf(stderr, " %lu\n", val[0]); + val[1] = 0; + val[2] = 0; + return 0; + } +} + + // --------------------------------------------- // signal handler // --------------------------------------------- static int perf_event_handler( - int sig, - siginfo_t* siginfo, + int sig, + siginfo_t* siginfo, void* context ) { @@ -1118,7 +1260,7 @@ perf_event_handler( // ---------------------------------------------------------------------------- if 
(siginfo->si_code < 0) { - TMSG(LINUX_PERF, "signal si_code %d < 0 indicates not from kernel", + TMSG(LINUX_PERF, "signal si_code %d < 0 indicates not from kernel", siginfo->si_code); perf_start_all(nevents, event_thread); @@ -1161,7 +1303,7 @@ perf_event_handler( // ---------------------------------------------------------------------------- // check #4: // check the index of the file descriptor (if we have multiple events) - // if the file descriptor is not on the list, we shouldn't store the + // if the file descriptor is not on the list, we shouldn't store the // metrics. Perhaps we should throw away? // ---------------------------------------------------------------------------- @@ -1180,6 +1322,10 @@ perf_event_handler( return 1; // tell monitor the signal has not been handled } + // Increment the number of overflows for the current event + current->num_overflows++; + + // ---------------------------------------------------------------------------- // parse the buffer until it finishes reading all buffers // ---------------------------------------------------------------------------- @@ -1199,8 +1345,9 @@ perf_event_handler( record_sample(current, &mmap_data, context, &sv); kernel_block_handler(current, sv, &mmap_data); + } while (more_data); - } while (more_data); + perf_reset_counter(fd); perf_start_all(nevents, event_thread); diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.c b/src/tool/hpcrun/sample-sources/perf/perf-util.c index 1c1c6495d2..f441dde1e7 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.c +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.c @@ -119,12 +119,12 @@ perf_split_retained_node( // extract the abstract address in the node cct_addr_t *addr = hpcrun_cct_addr(node); - // create an abstract address representing the next machine code address + // create an abstract address representing the next machine code address cct_addr_t sibling_addr = *addr; sibling_addr.ip_norm.lm_ip++; // get the necessary sibling to node - 
cct_node_t *sibling = hpcrun_cct_insert_addr(hpcrun_cct_parent(node), + cct_node_t *sibling = hpcrun_cct_insert_addr(hpcrun_cct_parent(node), &sibling_addr); return sibling; @@ -136,8 +136,8 @@ perf_split_retained_node( */ static cct_node_t * perf_insert_cct( - uint16_t lm_id, - cct_node_t *parent, + uint16_t lm_id, + cct_node_t *parent, u64 ip ) { @@ -175,8 +175,8 @@ perf_util_get_kptr_restrict() return privilege; } -static uint16_t -perf_get_kernel_lm_id() +static uint16_t +perf_get_kernel_lm_id() { if (ksym_status == PERF_AVAILABLE && perf_kernel_lm_id == 0) { // ensure that this is initialized only once per process @@ -212,12 +212,12 @@ perf_get_kernel_lm_id() //---------------------------------------------------------- static cct_node_t * perf_add_kernel_callchain( - cct_node_t *leaf, + cct_node_t *leaf, void *data_aux ) { cct_node_t *parent = leaf; - + if (data_aux == NULL) { return parent; } @@ -226,14 +226,14 @@ perf_add_kernel_callchain( if (data->nr > 0) { uint16_t kernel_lm_id = perf_get_kernel_lm_id(); - // bug #44 https://github.com/HPCToolkit/hpctoolkit/issues/44 + // bug #44 https://github.com/HPCToolkit/hpctoolkit/issues/44 // if no kernel symbols are available, collapse the kernel call // chain into a single node if (perf_util_get_kptr_restrict() != 0) { return perf_insert_cct(kernel_lm_id, parent, 0); } - // add kernel IPs to the call chain top down, which is the + // add kernel IPs to the call chain top down, which is the // reverse of the order in which they appear in ips[] for (int i = data->nr - 1; i > 0; i--) { parent = perf_insert_cct(kernel_lm_id, parent, data->ips[i]); @@ -317,7 +317,7 @@ perf_util_kernel_syms_avail() /************************************************************* * Interface API - **************************************************************/ + **************************************************************/ //---------------------------------------------------------- // initialize perf_util. 
Need to be called as earliest as possible @@ -325,10 +325,10 @@ perf_util_kernel_syms_avail() void perf_util_init() { - // perf_kernel_lm_id must be set for each process. here, we clear it - // because it is too early to allocate a load module. it will be set + // perf_kernel_lm_id must be set for each process. here, we clear it + // because it is too early to allocate a load module. it will be set // later, exactly once per process if ksym_status == PERF_AVAILABLE. - perf_kernel_lm_id = 0; + perf_kernel_lm_id = 0; // if kernel symbols are available, we will attempt to collect kernel // callchains and add them to our call paths @@ -381,15 +381,20 @@ perf_util_attr_init( // some PMUs is sensitive to the sample type. // For instance, IDLE-CYCLES-BACKEND will fail if we set PERF_SAMPLE_ADDR. // By default, we need to initialize sample_type as minimal as possible. - - // add PERF_SAMPLE_ADDR | PERF_SAMPLE_IP for witch use - unsigned int sample_type = sampletype | PERF_SAMPLE_ADDR | PERF_SAMPLE_IP - | PERF_SAMPLE_PERIOD | PERF_SAMPLE_TIME; + unsigned int sample_type = sampletype + | PERF_SAMPLE_PERIOD | PERF_SAMPLE_TIME + | PERF_SAMPLE_IP | PERF_SAMPLE_ADDR + | PERF_SAMPLE_CPU | PERF_SAMPLE_TID + | PERF_SAMPLE_WEIGHT | PERF_SAMPLE_DATA_SRC; attr->size = sizeof(struct perf_event_attr); /* Size of attribute structure */ attr->freq = (usePeriod ? 
0 : 1); attr->sample_period = threshold; /* Period or frequency of sampling */ + + // It enables that we can directly read the value of the event counter via file descriptor + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING; + int max_sample_rate = perf_util_get_max_sample_rate(); if (attr->freq == 1 && threshold >= max_sample_rate) { @@ -417,7 +422,7 @@ perf_util_attr_init( #endif attr->exclude_kernel = INCLUDE; } - + char *name; int precise_ip_type = perf_skid_parse_event(event_name, &name); free(name); @@ -425,7 +430,7 @@ perf_util_attr_init( u64 precise_ip; switch (precise_ip_type) { - case PERF_EVENT_AUTODETECT_SKID: + case PERF_EVENT_AUTODETECT_SKID: precise_ip = perf_skid_set_max_precise_ip(attr); break; case PERF_EVENT_SKID_ERROR: diff --git a/src/tool/hpcrun/sample-sources/perf/perf-util.h b/src/tool/hpcrun/sample-sources/perf/perf-util.h index e70d3e614a..7b39e8d6b3 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf-util.h +++ b/src/tool/hpcrun/sample-sources/perf/perf-util.h @@ -109,6 +109,19 @@ typedef struct perf_mmap_data_s { } perf_mmap_data_t; +// data structure for the data_src field of perf_mmap_data_t +typedef union perf_mmap_data_src_t { + uint64_t val; + struct { + uint64_t mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_rsvd:31; + }; +} perf_mmap_data_src_t; + // -------------------------------------------------------------- // main data structure to store the information of an event. 
@@ -139,7 +152,7 @@ typedef struct event_thread_s { pe_mmap_t *mmap; // mmap buffer int fd; // file descriptor of the event event_info_t *event; // pointer to main event description - + uint64_t num_overflows; // record how many times this event has overflowed } event_thread_t; diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c index 61c79536f3..4970c7e552 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.c +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.c @@ -62,7 +62,6 @@ #include #include - /****************************************************************************** * hpcrun includes *****************************************************************************/ @@ -404,6 +403,7 @@ parse_buffer(int sample_type, event_thread_t *current, perf_mmap_data_t *mmap_in #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) if (sample_type & PERF_SAMPLE_WEIGHT) { + perf_read_u64(current_perf_mmap, &mmap_info->weight); data_read++; } if (sample_type & PERF_SAMPLE_DATA_SRC) { @@ -426,10 +426,110 @@ parse_buffer(int sample_type, event_thread_t *current, perf_mmap_data_t *mmap_in return data_read; } + +#if defined(__x86_64__) || defined(__i386__) + +#ifdef __x86_64__ +#define DECLARE_ARGS(val, low, high) unsigned low, high +#define EAX_EDX_VAL(val, low, high) ((low) | ((uint64_t )(high) << 32)) +#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else +#define DECLARE_ARGS(val, low, high) unsigned long long val +#define EAX_EDX_VAL(val, low, high) (val) +#define EAX_EDX_ARGS(val, low, high) "A" (val) +#define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + +#define barrier() __asm__ __volatile__("": : :"memory") + +static inline int rdpmc(pe_mmap_t *mmap, uint64_t *value) +{ + int counter = mmap->index - 1; + //fprintf(stderr,"counter = %d\n", counter); + DECLARE_ARGS(val, low, high); + + if (counter < 0) return -1; + + asm 
volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); + *value = EAX_EDX_VAL(val, low, high); + return 0; +} +#else +#error("rdpmc() is not defined"); +#endif + + +/* + * val[0] = raw count + * val[1] = TIME_ENABLED + * val[2] = TIME_RUNNING + */ +static inline int isCounterValid(uint64_t *val){ + if (!val[2] && !val[1] && val[0]) { + fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + return -1; + } + if (val[2] > val[1]) { + fprintf(stderr, "WARNING: time_running > time_enabled\n"); + return -1; + } + return 1; +} + + //---------------------------------------------------------------------- // Public Interfaces //---------------------------------------------------------------------- +// read the counter value of the event +// val is an array of uint64_t, at least has a length of 3 +int perf_read_event_counter(event_thread_t *current, uint64_t *val){ + + pe_mmap_t *current_perf_mmap = current->mmap; + //rdpmc(current_perf_mmap, val); //something wrong when using rdpmc + + if (current->fd < 0){ + EMSG("Error: unable to open the event %d file descriptor", current->event->id); + return -1; + } + int ret = read(current->fd, val, sizeof(uint64_t) * 3 ); + if (ret < sizeof(uint64_t)*3) { + EMSG("Error: unable to read event %d", current->event->id); + return -1; + } + return 0; +} + +/* + * val[0] = raw count + * val[1] = TIME_ENABLED + * val[2] = TIME_RUNNING + */ +uint64_t perf_get_scaled_counter_val(uint64_t *val){ + uint64_t res = 0; + isCounterValid(val); + if (val[2]) { + res = (uint64_t)((double)val[0] * val[1]/val[2] ); + } + return res; +} + + +uint64_t perf_get_scaled_counter_delta(uint64_t *val, uint64_t *prev_val){ + uint64_t res = 0; + isCounterValid(val); + isCounterValid(prev_val); + + if (val[2] - prev_val[2]) { + res = (uint64_t)( + ((double)val[0] - (double)prev_val[0]) * ( (double)val[1] - (double)prev_val[1]) + / ((double) val[2] - (double)prev_val[2]) + ); + } + return res; +} + 
//---------------------------------------------------------- // reading mmap buffer from the kernel diff --git a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h index cfdd0ebf8c..d1a569cfef 100644 --- a/src/tool/hpcrun/sample-sources/perf/perf_mmap.h +++ b/src/tool/hpcrun/sample-sources/perf/perf_mmap.h @@ -83,6 +83,7 @@ pe_mmap_t* set_mmap(int perf_fd); void perf_unmmap(pe_mmap_t *mmap); int read_perf_buffer(event_thread_t *current, perf_mmap_data_t *mmap_info); - - +int perf_read_event_counter(event_thread_t *current, uint64_t *val); +uint64_t perf_get_scaled_counter_val(uint64_t *val); +uint64_t perf_get_scaled_counter_delta(uint64_t *val, uint64_t *prev_val); #endif diff --git a/src/tool/hpcrun/sample-sources/watchpoint_clients.c b/src/tool/hpcrun/sample-sources/watchpoint_clients.c index cce0db0e72..d2f51f198e 100755 --- a/src/tool/hpcrun/sample-sources/watchpoint_clients.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_clients.c @@ -141,8 +141,19 @@ int load_metric_id = -1; int dead_metric_id = -1; int measured_metric_id = -1; int latency_metric_id = -1; -int temporal_metric_id = -1; -int spatial_metric_id = -1; +int latency_l1_miss_load_metric_id = -1; +int latency_l2_miss_load_metric_id = -1; +int latency_l3_miss_load_metric_id = -1; + +int temporal_reuse_metric_id = -1; +int spatial_reuse_metric_id = -1; +int reuse_time_distance_metric_id = -1; // use rdtsc() to represent the reuse distance +int reuse_time_distance_count_metric_id = -1; // how many times reuse_time_distance_metric is incremented +int reuse_memory_distance_metric_id = -1; // use Loads+stores to reprent the reuse distance +int reuse_memory_distance_count_metric_id = -1; // how many times reuse_memory_distance_metric is incremented +int reuse_buffer_metric_ids[2] = {-1, -1}; // used to store temporal data for reuse client +int reuse_store_buffer_metric_id = -1; // store the last time we get an available value of stores + int 
false_ww_metric_id = -1; int false_rw_metric_id = -1; int false_wr_metric_id = -1; @@ -150,11 +161,44 @@ int true_ww_metric_id = -1; int true_rw_metric_id = -1; int true_wr_metric_id = -1; + +int *reuse_distance_events = NULL; +int reuse_distance_num_events = 0; +#ifdef REUSE_HISTO +bool reuse_output_trace = false; +double reuse_bin_start = 0; +double reuse_bin_ratio = 0; +uint64_t * reuse_bin_list = NULL; +double * reuse_bin_pivot_list = NULL; // store the bin intervals +int reuse_bin_size = 0; +#else +AccessType reuse_monitor_type = LOAD_AND_STORE; // WP_REUSE: what kind of memory access can be used to subscribe the watchpoint +WatchPointType reuse_trap_type = WP_RW; // WP_REUSE: what kind of memory access can trap the watchpoint +ReuseType reuse_profile_type = REUSE_BOTH; // WP_REUSE: we want to collect temporal reuse, spatial reuse OR both? +bool reuse_concatenate_use_reuse = false; // WP_REUSE: how to concatentate the use and reuse +#endif + #define NUM_WATERMARK_METRICS (4) int curWatermarkId = 0; int watermark_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; int pebs_metric_id[NUM_WATERMARK_METRICS] = {-1, -1, -1, -1}; +static inline uint64_t perf_scale(uint64_t *values) { + uint64_t res = 0; + + if (!values[2] && !values[1] && values[0]) { + fprintf(stderr,"WARNING: time_running = 0 = time_enabled, raw count not zero\n"); + } + if (values[2] > values[1]) { + fprintf(stderr, "WARNING: time_running > time_enabled\n"); + } + if (values[2]) { + res = (uint64_t)((double)values[0] * values[1]/values[2]); + } + return res; +} + + void SetupWatermarkMetric(int metricId){ if (curWatermarkId == NUM_WATERMARK_METRICS) { EEMSG("curWatermarkId == NUM_WATERMARK_METRICS = %d", NUM_WATERMARK_METRICS); @@ -190,8 +234,7 @@ __thread WPStats_t wpStats; #define WP_DEADSPY_EVENT_NAME "WP_DEADSPY" #define WP_REDSPY_EVENT_NAME "WP_REDSPY" #define WP_LOADSPY_EVENT_NAME "WP_LOADSPY" -#define WP_TEMPORAL_REUSE_EVENT_NAME "WP_TEMPORAL_REUSE" -#define WP_SPATIAL_REUSE_EVENT_NAME 
"WP_SPATIAL_REUSE" +#define WP_REUSE_EVENT_NAME "WP_REUSE" #define WP_FALSE_SHARING_EVENT_NAME "WP_FALSE_SHARING" #define WP_TRUE_SHARING_EVENT_NAME "WP_TRUE_SHARING" #define WP_ALL_SHARING_EVENT_NAME "WP_ALL_SHARING" @@ -204,8 +247,7 @@ typedef enum WP_CLIENT_ID{ WP_DEADSPY, WP_REDSPY, WP_LOADSPY, - WP_TEMPORAL_REUSE, - WP_SPATIAL_REUSE, + WP_REUSE, WP_FALSE_SHARING, WP_ALL_SHARING, WP_TRUE_SHARING, @@ -250,7 +292,8 @@ __thread uint64_t falseRWIns = 0; __thread uint64_t trueWWIns = 0; __thread uint64_t trueWRIns = 0; __thread uint64_t trueRWIns = 0; -__thread uint64_t reuse = 0; +__thread uint64_t reuseTemporal = 0; +__thread uint64_t reuseSpatial = 0; // Some stats __thread long int correct=0; @@ -279,14 +322,112 @@ __thread long ipDiff=0; //static int some_overflow; +/****************************************************************************** + * private tool function +*****************************************************************************/ +static int OpenWitchTraceOutput(){ + #define OUTPUT_TRACE_BUFFER_SIZE (1 <<10) + char file_name[PATH_MAX]; + int ret = snprintf(file_name, PATH_MAX, "%s-%u.reuse.hpcrun", hpcrun_files_executable_name(), syscall(SYS_gettid)); + if ( ret < 0 || ret >= PATH_MAX){ + return -1; + } + int fd = open(file_name, O_WRONLY | O_CREAT | O_APPEND, 0644); + if (fd < 0){ + return -1; + } + ret = hpcio_outbuf_attach(&(TD_GET(witch_client_trace_output)), fd, hpcrun_malloc(OUTPUT_TRACE_BUFFER_SIZE), OUTPUT_TRACE_BUFFER_SIZE, HPCIO_OUTBUF_UNLOCKED); + if (ret != HPCFMT_OK){ + return -1; + } + return 0; +} + +static void CloseWitchTraceOutput(){ + hpcio_outbuf_t *out_ptr = &(TD_GET(witch_client_trace_output)); + if (out_ptr->fd >= 0){ + hpcio_outbuf_close(out_ptr); + } +} + +static int WriteWitchTraceOutput(const char *fmt, ...){ + #define LOCAL_BUFFER_SIZE 1024 + va_list arg; + char local_buf[LOCAL_BUFFER_SIZE]; + va_start(arg, fmt); + int data_size = vsnprintf(local_buf, LOCAL_BUFFER_SIZE, fmt, arg); + va_end(arg); + if (data_size < 0 
|| data_size >= LOCAL_BUFFER_SIZE){ + return -1; + } + int ret = hpcio_outbuf_write(&(TD_GET(witch_client_trace_output)), local_buf, data_size); + if (ret != data_size){ + return -1; + } + return 0; +} + +#ifdef REUSE_HISTO +void ExpandReuseBinList(){ + // each time we double the size of reuse_bin_list + uint64_t *old_reuse_bin_list = reuse_bin_list; + double *old_reuse_bin_pivot_list = reuse_bin_pivot_list; + int old_reuse_bin_size = reuse_bin_size; + reuse_bin_size *= 2; + + reuse_bin_list = hpcrun_malloc(sizeof(uint64_t) * reuse_bin_size); + memset(reuse_bin_list, 0, sizeof(uint64_t) * reuse_bin_size); + memcpy(reuse_bin_list, old_reuse_bin_list, sizeof(uint64_t) * old_reuse_bin_size); + + reuse_bin_pivot_list = hpcrun_malloc(sizeof(double) * reuse_bin_size); + memset(reuse_bin_pivot_list, 0, sizeof(double) * reuse_bin_size); + memcpy(reuse_bin_pivot_list, old_reuse_bin_pivot_list, sizeof(double) * old_reuse_bin_size); + for(int i=old_reuse_bin_size; i < reuse_bin_size; i++){ + reuse_bin_pivot_list[i] = reuse_bin_pivot_list[i-1] * reuse_bin_ratio; + } + + //hpcrun_free(old_reuse_bin_list); + //hpcrun_free(old_reuse_bin_pivot_list); +} + +int FindReuseBinIndex(uint64_t distance){ + + if (distance < reuse_bin_pivot_list[0]){ + return 0; + } + if (distance >= reuse_bin_pivot_list[reuse_bin_size - 1]){ + ExpandReuseBinList(); + return FindReuseBinIndex(distance); + } + + int left = 0, right = reuse_bin_size - 1; + while(left + 1 < right){ + int mid = (left + right) / 2; + if ( distance < reuse_bin_pivot_list[mid]){ + right = mid; + } else { + left = mid; + } + } + assert(left + 1 == right); + return left + 1; +} + + +void ReuseAddDistance(uint64_t distance, uint64_t inc ){ + int index = FindReuseBinIndex(distance); + reuse_bin_list[index] += inc; +} +#endif + + /****************************************************************************** * method functions *****************************************************************************/ static WPTriggerActionType
DeadStoreWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOffseti, int safeAccessLen, WatchPointTrigger_t * wt); -static WPTriggerActionType TemporalReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); -static WPTriggerActionType SpatialReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); +static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType LoadLoadWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType FalseSharingWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); static WPTriggerActionType AllSharingWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt); @@ -321,21 +462,13 @@ static WpClientConfig_t wpClientConfig[] = { .preWPAction = DISABLE_WP, .configOverrideCallback = LoadSpyWPConfigOverride }, - /**** Temporal Reuse ***/ - { - .id = WP_TEMPORAL_REUSE, - .name = WP_TEMPORAL_REUSE_EVENT_NAME, - .wpCallback = TemporalReuseWPCallback, - .preWPAction = DISABLE_WP, - .configOverrideCallback = TemporalReuseWPConfigOverride - }, - /**** Spatial Reuse ***/ + /**** Reuse ***/ { - .id = WP_SPATIAL_REUSE, - .name = WP_SPATIAL_REUSE_EVENT_NAME, - .wpCallback = SpatialReuseWPCallback, + .id = WP_REUSE, + .name = WP_REUSE_EVENT_NAME, + .wpCallback = ReuseWPCallback, .preWPAction = DISABLE_WP, - .configOverrideCallback = SpatialReuseWPConfigOverride + .configOverrideCallback = ReuseWPConfigOverride }, /**** False Sharing ***/ { @@ -385,7 +518,7 @@ static WpClientConfig_t wpClientConfig[] = { .preWPAction = DISABLE_WP, .configOverrideCallback = IPCTrueSharingWPConfigOverride } - + }; @@ -428,6 +561,7 
@@ static void PopulateBlackListAddresses() { const char delim[] = " \n"; addr = strtok_r(l, delim, &save); char* perms = strtok_r(NULL, delim, &save); + (void) perms; //supress compiler's warning // skip 3 tokens for (int i=0; i < 3; i++) { (void) strtok_r(NULL, delim, &save);} char* name = strtok_r(NULL, delim, &save); @@ -490,12 +624,15 @@ METHOD_FN(start) { thread_data_t* td = hpcrun_get_thread_data(); source_state_t my_state = TD_GET(ss_state)[self->sel_idx]; - + if (my_state == START) { TMSG(WATCHPOINT,"*NOTE* WATCHPOINT start called when already in state START"); return; } td->ss_state[self->sel_idx] = START; +#ifdef REUSE_HISTO + assert(OpenWitchTraceOutput()==0); +#endif } static void ClientTermination(){ @@ -503,7 +640,7 @@ static void ClientTermination(){ hpcrun_stats_num_samples_imprecise_inc(wpStats.numImpreciseSamples); hpcrun_stats_num_watchpoints_set_inc(wpStats.numWatchpointsSet); WatchpointThreadTerminate(); - + switch (theWPConfig->id) { case WP_DEADSPY: hpcrun_stats_num_writtenBytes_inc(writtenBytes); @@ -522,14 +659,35 @@ static void ClientTermination(){ hpcrun_stats_num_oldBytes_inc(oldBytes); hpcrun_stats_num_oldAppxBytes_inc(oldAppxBytes); break; - case WP_TEMPORAL_REUSE: - hpcrun_stats_num_accessedIns_inc(accessedIns); - hpcrun_stats_num_reuse_inc(reuse); - break; - case WP_SPATIAL_REUSE: + case WP_REUSE: + { +#ifdef REUSE_HISTO + uint64_t val[3]; + //fprintf(stderr, "FINAL_COUNTING:"); + if (reuse_output_trace == false){ //dump the bin info + WriteWitchTraceOutput("BIN_START: %lf\n", reuse_bin_start); + WriteWitchTraceOutput("BIN_RATIO: %lf\n", reuse_bin_ratio); + + for(int i=0; i < reuse_bin_size; i++){ + WriteWitchTraceOutput("BIN: %d %lu\n", i, reuse_bin_list[i]); + } + } + + WriteWitchTraceOutput("FINAL_COUNTING:"); + for (int i=0; i < MIN(2,reuse_distance_num_events); i++){ + assert(linux_perf_read_event_counter(reuse_distance_events[i], val) >= 0); + //fprintf(stderr, " %lu %lu %lu,", val[0], val[1], val[2]);//jqswang + 
WriteWitchTraceOutput(" %lu %lu %lu,", val[0], val[1], val[2]); + } + //fprintf(stderr, "\n"); + WriteWitchTraceOutput("\n"); + //close the trace output + CloseWitchTraceOutput(); +#endif hpcrun_stats_num_accessedIns_inc(accessedIns); - hpcrun_stats_num_reuse_inc(reuse); - break; + hpcrun_stats_num_reuseTemporal_inc(reuseTemporal); + hpcrun_stats_num_reuseSpatial_inc(reuseSpatial); + } break; case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: hpcrun_stats_num_accessedIns_inc(accessedIns); @@ -554,7 +712,7 @@ static void ClientTermination(){ hpcrun_stats_num_trueWWIns_inc(trueWWIns); hpcrun_stats_num_trueRWIns_inc(trueRWIns); hpcrun_stats_num_trueWRIns_inc(trueWRIns); - + default: break; } @@ -580,10 +738,10 @@ TopN(cct_node_t* node, cct_op_arg_t arg, size_t level) if (!set) return; hpcrun_metricVal_t *loc = hpcrun_metric_set_loc(set, metricID); if (!loc) return; - + uint64_t val = loc->i; if (val == 0) return; - + for (i=0; icore_profile_trace_data.epoch->csdata.tree_root; //TODO: partial? 
cct_node_t *partial = td->core_profile_trace_data.epoch->csdata.partial_unw_root; - + // trave root first and then partial second hpcrun_cct_walk_node_1st(root, TopN, (void *) metricID); - + int i, j; for (i=0; ii; - + if (val2 > val1) { cct_node_t *tmp = topNNode[i]; topNNode[i] = topNNode[j]; @@ -648,9 +806,9 @@ PrintTopN(int metricID) path = default_path; } sprintf(path, "%s/%s", path, "topN.log"); - + fd = fopen(path, "a+"); - + int libmonitorId, libhpcrunId; // print loadmodule info first fprintf (fd, "\n"); @@ -708,22 +866,22 @@ METHOD_FN(stop) //thread_data_t *td = hpcrun_get_thread_data(); //int nevents = self->evl.nevents; source_state_t my_state = TD_GET(ss_state)[self->sel_idx]; - + if (my_state == STOP) { TMSG(WATCHPOINT,"*NOTE* WATCHPOINT stop called when already in state STOP"); return; } - + if (my_state != START) { TMSG(WATCHPOINT,"*WARNING* WATCHPOINT stop called when not in state START"); return; } - + ClientTermination(); - + if (ENABLED(PRINTTOPN)) PrintTopN(dead_metric_id); - + TD_GET(ss_state)[self->sel_idx] = STOP; } @@ -731,7 +889,7 @@ static void METHOD_FN(shutdown) { TMSG(WATCHPOINT, "shutdown"); - + METHOD_CALL(self, stop); // make sure stop has been called self->state = UNINIT; } @@ -775,7 +933,7 @@ METHOD_FN(process_event_list, int lush_metrics) } char* evlist = METHOD_CALL(self, get_event_str); char* event = start_tok(evlist); - + // only one supported for(int i = 0; i < WP_MAX_CLIENTS; i++) { if (hpcrun_ev_is(event, wpClientConfig[i].name)) { @@ -783,17 +941,17 @@ METHOD_FN(process_event_list, int lush_metrics) break; } } - + wpStats.numImpreciseSamples = 0; wpStats.numWatchpointsSet = 0; WatchpointThreadInit(theWPConfig->wpCallback); - + if(theWPConfig->configOverrideCallback){ theWPConfig->configOverrideCallback(0); } - + PopulateBlackListAddresses(); - + switch (theWPConfig->id) { case WP_DEADSPY: measured_metric_id = hpcrun_new_metric(); @@ -801,7 +959,7 @@ METHOD_FN(process_event_list, int lush_metrics) dead_metric_id = 
hpcrun_new_metric(); hpcrun_set_metric_info_and_period(dead_metric_id, "BYTES_DEAD", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - + case WP_REDSPY: case WP_LOADSPY: measured_metric_id = hpcrun_new_metric(); @@ -811,17 +969,151 @@ METHOD_FN(process_event_list, int lush_metrics) redApprox_metric_id = hpcrun_new_metric(); hpcrun_set_metric_info_and_period(redApprox_metric_id, "BYTES_RED_APPROX", MetricFlags_ValFmt_Int, 1, metric_property_none); break; - - case WP_TEMPORAL_REUSE: - temporal_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(temporal_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); - break; - - case WP_SPATIAL_REUSE: - spatial_metric_id = hpcrun_new_metric(); - hpcrun_set_metric_info_and_period(spatial_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + + case WP_REUSE: + { +#ifdef REUSE_HISTO + { + char * bin_scheme_str = getenv("HPCRUN_WP_REUSE_BIN_SCHEME"); + if (bin_scheme_str){ + if ( 0 == strcasecmp(bin_scheme_str, "TRACE")){ + reuse_output_trace = true; + } + else { // it should be two numbers connected by "," + // For example, 4000.0,2.0 + char *dup_str = strdup(bin_scheme_str); + char *pos = strchr(dup_str, ','); + if ( pos == NULL){ + EEMSG("Invalid value of the environmental variable HPCRUN_WP_REUSE_BIN_SCHEME"); + free(dup_str); + monitor_real_abort(); + } + pos[0] = '\0'; + pos += 1; + + char *endptr; + reuse_bin_start = strtod(dup_str, &endptr); + if (reuse_bin_start <= 0.0 || reuse_bin_start == HUGE_VAL || endptr[0] != '\0'){ + EEMSG("Invalid value of the environmental variable HPCRUN_WP_REUSE_BIN_SCHEME"); + free(dup_str); + monitor_real_abort(); + } + reuse_bin_ratio = strtod(pos, &endptr); + if (reuse_bin_ratio <= 1.0 || reuse_bin_ratio == HUGE_VAL || endptr[0] != '\0'){ + EEMSG("Invalid value of the environmental variable HPCRUN_WP_REUSE_BIN_SCHEME"); + free(dup_str); + monitor_real_abort(); + } + free(dup_str); + printf("HPCRUN: start %lf, ratio %lf\n", 
reuse_bin_start, reuse_bin_ratio); + } + } else { //default + reuse_output_trace = false; + reuse_bin_start = 4000; + reuse_bin_ratio = 2; + } + + if (reuse_output_trace == false){ + reuse_bin_size = 20; + reuse_bin_list = hpcrun_malloc(sizeof(uint64_t)*reuse_bin_size); + memset(reuse_bin_list, 0, sizeof(uint64_t)*reuse_bin_size); + reuse_bin_pivot_list = hpcrun_malloc(sizeof(double)*reuse_bin_size); + reuse_bin_pivot_list[0] = reuse_bin_start; + for(int i=1; i < reuse_bin_size; i++){ + reuse_bin_pivot_list[i] = reuse_bin_pivot_list[i-1] * reuse_bin_ratio; + } + } + + } + +#else + { + char * monitor_type_str = getenv("HPCRUN_WP_REUSE_PROFILE_TYPE"); + if(monitor_type_str){ + if(0 == strcasecmp(monitor_type_str, "TEMPORAL")) { + reuse_profile_type = REUSE_TEMPORAL; + } else if (0 == strcasecmp(monitor_type_str, "SPATIAL")) { + reuse_profile_type = REUSE_SPATIAL; + } else if ( 0 == strcasecmp(monitor_type_str, "ALL") ) { + reuse_profile_type = REUSE_BOTH; + } else { + // default; + reuse_profile_type = REUSE_BOTH; + } + } else{ + // default + reuse_profile_type = REUSE_BOTH; + } + } + + { + char * monitor_type_str = getenv("HPCRUN_WP_REUSE_MONITOR_TYPE"); + if(monitor_type_str){ + if(0 == strcasecmp(monitor_type_str, "LOAD")) { + reuse_monitor_type = LOAD; + } else if (0 == strcasecmp(monitor_type_str, "STORE")) { + reuse_monitor_type = STORE; + } else if (0 == strcasecmp(monitor_type_str, "LS") || 0 == strcasecmp(monitor_type_str, "ALL") ) { + reuse_monitor_type = LOAD_AND_STORE; + } else { + // default; + reuse_monitor_type = LOAD_AND_STORE; + } + } else{ + // default + reuse_monitor_type = LOAD_AND_STORE; + } + } + { + char *trap_type_str = getenv("HPCRUN_WP_REUSE_TRAP_TYPE"); + if(trap_type_str){ + if(0 == strcasecmp(trap_type_str, "LOAD")) { + reuse_trap_type = WP_RW; // NO WP_READ allowed + } else if (0 == strcasecmp(trap_type_str, "STORE")) { + reuse_trap_type = WP_WRITE; + } else if (0 == strcasecmp(trap_type_str, "LS") || 0 == strcasecmp(trap_type_str, 
"ALL") ) { + reuse_trap_type = WP_RW; + } else { + // default; + reuse_trap_type = WP_RW; + } + } else{ + // default + reuse_trap_type = WP_RW; + } + } + + { + char *concatenate_order_str = getenv("HPCRUN_WP_REUSE_CONCATENATE_ORDER"); + if(concatenate_order_str && 0 == strcasecmp(concatenate_order_str, "USE_REUSE")){ + reuse_concatenate_use_reuse = true; + } else{ + reuse_concatenate_use_reuse = false; + } + } +#endif + + temporal_reuse_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(temporal_reuse_metric_id, "TEMPORAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + spatial_reuse_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(spatial_reuse_metric_id, "SPATIAL", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_memory_distance_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_memory_distance_metric_id, "MEMORY_DISTANCE_SUM", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_memory_distance_count_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_memory_distance_count_metric_id, "MEMORY_DISTANCE_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_time_distance_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_time_distance_metric_id, "TIME_DISTANCE_SUM", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_time_distance_count_metric_id = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_time_distance_count_metric_id, "TIME_DISTANCE_COUNT", MetricFlags_ValFmt_Int, 1, metric_property_none); + + // the next two buffers only for internal use + reuse_buffer_metric_ids[0] = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[0], "REUSE_BUFFER_1", MetricFlags_ValFmt_Int, 1, metric_property_none); + reuse_buffer_metric_ids[1] = hpcrun_new_metric(); + hpcrun_set_metric_info_and_period(reuse_buffer_metric_ids[1],"REUSE_BUFFER_2", MetricFlags_ValFmt_Int, 1, metric_property_none); + 
+ } break; - case WP_ALL_SHARING: case WP_IPC_ALL_SHARING: // must have a canonical load map across processes @@ -831,7 +1123,7 @@ METHOD_FN(process_event_list, int lush_metrics) SetUpFalseSharingMetrics(); SetUpTrueSharingMetrics(); break; - + case WP_FALSE_SHARING: case WP_IPC_FALSE_SHARING: // must have a canonical load map across processes @@ -840,7 +1132,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(measured_metric_id, "MONITORED", MetricFlags_ValFmt_Int, 1, metric_property_none); SetUpFalseSharingMetrics(); break; - + case WP_TRUE_SHARING: case WP_IPC_TRUE_SHARING: // must have a canonical load map across processes @@ -849,7 +1141,7 @@ METHOD_FN(process_event_list, int lush_metrics) hpcrun_set_metric_info_and_period(measured_metric_id, "MONITORED", MetricFlags_ValFmt_Int, 1, metric_property_none); SetUpTrueSharingMetrics(); break; - + default: break; } @@ -872,9 +1164,7 @@ METHOD_FN(display_events) printf("---------------------------------------------------------------------------\n"); printf("%s\n", WP_LOADSPY_EVENT_NAME); printf("---------------------------------------------------------------------------\n"); - printf("%s\n", WP_TEMPORAL_REUSE_EVENT_NAME); - printf("---------------------------------------------------------------------------\n"); - printf("%s\n", WP_SPATIAL_REUSE_EVENT_NAME); + printf("%s\n", WP_REUSE_EVENT_NAME); printf("---------------------------------------------------------------------------\n"); printf("%s\n", WP_FALSE_SHARING_EVENT_NAME); printf("---------------------------------------------------------------------------\n"); @@ -913,8 +1203,10 @@ enum JoinNodeType { E_KILLED=0, E_USED, E_NEW_VAL, - E_TEPORALLY_REUSED, - E_SPATIALLY_REUSED, + E_TEMPORALLY_REUSED_FROM, + E_TEMPORALLY_REUSED_BY, + E_SPATIALLY_REUSED_FROM, + E_SPATIALLY_REUSED_BY, E_TRUE_WW_SHARE, E_TRUE_WR_SHARE, E_TRUE_RW_SHARE, @@ -944,8 +1236,14 @@ static void USED_BY_INACCURATE_PC(void) {} static void NEW_VAL_BY(void) {} 
static void NEW_VAL_BY_INACCURATE_PC(void) {} -static void TEPORALLY_REUSED_BY(void) {} -static void TEPORALLY_REUSED_BY_INACCURATE_PC(void) {} +static void TEMPORALLY_REUSED_FROM(void) {} +static void TEMPORALLY_REUSED_FROM_INACCURATE_PC(void) {} + +static void TEMPORALLY_REUSED_BY(void) {} +static void TEMPORALLY_REUSED_BY_INACCURATE_PC(void) {} + +static void SPATIALLY_REUSED_FROM(void) {} +static void SPATIALLY_REUSED_FROM_INACCURATE_PC(void) {} static void SPATIALLY_REUSED_BY(void) {} static void SPATIALLY_REUSED_BY_INACCURATE_PC(void) {} @@ -993,8 +1291,10 @@ static const void * joinNodes[][2] = { [E_KILLED] = GET_FUN_ADDR(KILLED_BY), [E_USED] = GET_FUN_ADDR(USED_BY), [E_NEW_VAL] = GET_FUN_ADDR(NEW_VAL_BY), - [E_TEPORALLY_REUSED] = GET_FUN_ADDR(TEPORALLY_REUSED_BY), - [E_SPATIALLY_REUSED] = GET_FUN_ADDR(SPATIALLY_REUSED_BY), + [E_TEMPORALLY_REUSED_FROM] = GET_FUN_ADDR(TEMPORALLY_REUSED_FROM), + [E_TEMPORALLY_REUSED_BY] = GET_FUN_ADDR(TEMPORALLY_REUSED_BY), + [E_SPATIALLY_REUSED_FROM] = GET_FUN_ADDR(SPATIALLY_REUSED_FROM), + [E_SPATIALLY_REUSED_BY] = GET_FUN_ADDR(SPATIALLY_REUSED_BY), [E_TRUE_WW_SHARE] = GET_FUN_ADDR(TRUE_WW_SHARE), [E_TRUE_WR_SHARE] = GET_FUN_ADDR(TRUE_WR_SHARE), [E_TRUE_RW_SHARE] = GET_FUN_ADDR(TRUE_RW_SHARE), @@ -1066,19 +1366,80 @@ static inline void UpdateConcatenatedPathPair(void *ctxt, cct_node_t * oldNode, } +//possible return type: (uint64_t *) for interger, (double) for real +static void *get_metric_data_ptr(int metric_id, cct_node_t *node){ + if (! 
hpcrun_has_metric_set(node)) { + cct2metrics_assoc(node, hpcrun_metric_set_new()); + } + metric_set_t* set = hpcrun_get_metric_set(node); + metric_desc_t* minfo = hpcrun_id2metric(metric_id); + if (!minfo) { + return NULL; + } + hpcrun_metricVal_t* loc = hpcrun_metric_set_loc(set, metric_id); + switch (minfo->flags.fields.valFmt) { + case MetricFlags_ValFmt_Int: + return (void *) &(loc->i); + case MetricFlags_ValFmt_Real: + return (void *) &(loc->r); + default: + assert(false); + } + return NULL; +} + + +static inline cct_node_t *getPreciseNode(void *ctxt, void *precise_pc, int dummyMetricId){ + // currently, we assume precise_pc + 1 = context_pc for PEBS (+1 means one instruction) + // we want the context to point to the exact IP + + // unwind call stack once + sample_val_t v = hpcrun_sample_callpath(ctxt, dummyMetricId, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *new_node = v.sample_node; + if (precise_pc == 0) return new_node; + //fprintf("precise_pc = %lx\n", precise_pc); + cct_node_t *tmp_node = hpcrun_cct_parent(new_node); + assert(tmp_node); + if (is_same_function(hpcrun_context_pc(ctxt), precise_pc) == SAME_FN){ + tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); // in hpcrun_insert_special_node(), the ip is added by 1. We want to cancel it here. + new_node = tmp_node; + cct_addr_t *addr = hpcrun_cct_addr(tmp_node); + } + else { // if they are not within the same function. Set the node to the calling site. 
+ cct_addr_t * addr = hpcrun_cct_addr(tmp_node); + if (addr->ip_norm.lm_ip - (unsigned long)precise_pc <= 15){ + tmp_node = hpcrun_cct_parent(tmp_node); + assert(tmp_node); + tmp_node = hpcrun_insert_special_node(tmp_node, precise_pc-1); + new_node = tmp_node; + } + } + return new_node; +} + + +static inline cct_node_t *getConcatenatedNode(cct_node_t *bottomNode, cct_node_t * topNode, const void * joinNode){ + // insert a special node + cct_node_t *node = hpcrun_insert_special_node(topNode, joinNode); + // concatenate call paths + node = hpcrun_cct_insert_path_return_leaf(bottomNode, node); + return node; +} + + static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ if(!wt->pc) { // if the ip is 0, let's drop the WP return ALREADY_DISABLED; } - + // This is a approximation. // If we took N samples at wpi->sample.node since the last time a WP triggered here, // If this a dead write, we'll update the dead_writes metric at the call path sample.node:KILLED_BY:curctxt> // Otherwise (not dead), we'll update the used_writes metric at the call path sample.node:USED_BY:curctxt> // In either case, the increment will be (N * overlapBytes) // Bump up watermark_metric_id to match sampledMetricId - + double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); int overlapBytes = GET_OVERLAP_BYTES(wpi->sample.va, wpi->sample.wpLength, wt->va, wt->accessLength); @@ -1086,7 +1447,7 @@ static WPTriggerActionType DeadStoreWPCallback(WatchPointInfo_t *wpi, int startO fprintf(stderr, "\n wpi->sample.va=%p, wpi->sample.wpLength = %d, wt->va = %p, wt->accessLength=%d\n", wpi->sample.va, wpi->sample.wpLength, wt->va, wt->accessLength); monitor_real_abort(); } - + // Now increment dead_metric_id by numDiffSamples * wpi->sample.accessLength // I could have done numDiffSamples 
* overlapBytes, but it will cause misattribution when access sizes are not same at dead and kill sites. // Basically, we are assuming that whatever happened in the observed watchpoints is applicable to the entire access length @@ -1127,10 +1488,10 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf // if the ip is 0, let's drop the WP return ALREADY_DISABLED; } - + bool isFloatOperation = wt->floatType == ELEM_TYPE_UNKNOWN? false: true; bool redBytes = 0; - + // check integer instructions int overlapLen = GET_OVERLAP_BYTES(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); if(overlapLen <= 0){ @@ -1141,17 +1502,17 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf int joinNodeIdx = wpi->sample.isSamplePointAccurate? E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; int firstOffest = FIRST_OVERLAPPED_BYTE_OFFSET_IN_FIRST(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); int secondOffest = FIRST_OVERLAPPED_BYTE_OFFSET_IN_FIRST(wt->va, safeAccessLen, wpi->sample.va, wpi->sample.wpLength); - + void * wpiStartByte = wpi->sample.va + secondOffest; void * wtStartByte = wt->va + firstOffest; // if the overlapLen is not 4 or 8, we cannot do any FP, DP approximation. //wpiStartByte and wtStartByte are not 4 or 8 byte aligned, we cannot do any FP, DP approximation. - + // If we got an insane address that cannot be read, return silently if(!IsAddressReadable(wtStartByte)){ return ALREADY_DISABLED; } - + if(isFloatOperation){ switch (wt->floatType) { case ELEM_TYPE_SINGLE:{ @@ -1181,7 +1542,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } } break; - + case ELEM_TYPE_DOUBLE:{ if(overlapLen < sizeof(double)){ goto TreatLikeInteger; @@ -1207,7 +1568,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } } break; - + default: // unhandled!! 
goto TreatLikeInteger; break; @@ -1230,10 +1591,10 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_NEW_VAL][joinNodeIdx] /* joinNode*/, measured_metric_id /* checkedMetric */, inc); } }else /* non float */{ - + TreatLikeInteger: ; - + for(int i = firstOffest, k = secondOffest ; i < firstOffest + overlapLen; i++, k++){ if(((uint8_t*)(wt->va))[i] == wpi->value[k]) { redBytes ++; @@ -1244,7 +1605,7 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf } double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - + if(redBytes != 0) { // Now increment metric: if the entire overlap is redundant, amplify to numDiffSamples * wpi->sample.accessLength // This is an approximation of what might have happened. @@ -1264,37 +1625,94 @@ static WPTriggerActionType RedStoreWPCallback(WatchPointInfo_t *wpi, int startOf return ALREADY_DISABLED; } -static WPTriggerActionType TemporalReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ +static WPTriggerActionType ReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ +#if 0 // jqswang:TODO, how to handle it? if(!wt->pc) { - // if the ip is 0, let's retain the WP - return RETAIN_WP; + // if the ip is 0, let's drop the WP + //return RETAIN_WP; + return ALREADY_DISABLED; } - // Report a reuse - double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); - uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; - - // Now increment temporal_metric_id by numDiffSamples * overlapBytes - uint64_t inc = numDiffSamples; - reuse += inc; - UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_TEPORALLY_REUSED][joinNodeIdx] /* joinNode*/, temporal_metric_id /* checkedMetric */, inc); - return ALREADY_DISABLED; -} +#endif //jqswang -static WPTriggerActionType SpatialReuseWPCallback(WatchPointInfo_t *wpi, int startOffset, int safeAccessLen, WatchPointTrigger_t * wt){ - if(!wt->pc) { - // if the ip is 0, drop the WP - return ALREADY_DISABLED; + uint64_t val[2][3]; + for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ + assert(linux_perf_read_event_counter( reuse_distance_events[i], val[i]) >= 0); + //fprintf(stderr, "USE: %lu %lu %lu, REUSE: %lu %lu %lu\n", wpi->sample.reuseDistance[i][0], wpi->sample.reuseDistance[i][1], wpi->sample.reuseDistance[i][2], val[i][0], val[i][1], val[i][2]); + //fprintf(stderr, "DIFF: %lu\n", val[i][0] - wpi->sample.reuseDistance[i][0]); + for(int j=0; j < 3; j++){ + if (val[i][j] >= wpi->sample.reuseDistance[i][j]){ + val[i][j] -= wpi->sample.reuseDistance[i][j]; + } + else { //Something wrong happens here and the record is not reliable. Drop it! + return ALREADY_DISABLED; + } + } } - // Report a reuse + + // Report a reuse double myProportion = ProportionOfWatchpointAmongOthersSharingTheSameContext(wpi); uint64_t numDiffSamples = GetWeightedMetricDiffAndReset(wpi->sample.node, wpi->sample.sampledMetricId, myProportion); - int joinNodeIdx = wpi->sample.isSamplePointAccurate? E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; - // Now increment dead_metric_id by numDiffSamples * overlapBytes uint64_t inc = numDiffSamples; - reuse += inc; - - UpdateConcatenatedPathPair(wt->ctxt, wpi->sample.node /* oldNode*/, joinNodes[E_SPATIALLY_REUSED][joinNodeIdx] /* joinNode*/, spatial_metric_id /* checkedMetric */, inc); + int joinNodeIdx = wpi->sample.isSamplePointAccurate? 
E_ACCURATE_JOIN_NODE_IDX : E_INACCURATE_JOIN_NODE_IDX; + + uint64_t time_distance = rdtsc() - wpi->startTime; + +#ifdef REUSE_HISTO + //cct_node_t *reuseNode = getPreciseNode(wt->ctxt, wt->pc, temporal_reuse_metric_id ); + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *reuseNode = v.sample_node; + + if (reuse_output_trace){ + WriteWitchTraceOutput("REUSE_DISTANCE: %d %d %lu,", hpcrun_cct_persistent_id(wpi->sample.node), hpcrun_cct_persistent_id(reuseNode), inc); + for(int i=0; i < MIN(2, reuse_distance_num_events); i++){ + WriteWitchTraceOutput(" %lu %lu %lu,", val[i][0], val[i][1], val[i][2]); + } + WriteWitchTraceOutput("\n"); + } else{ + uint64_t rd = 0; + for(int i=0; i < MIN(2, reuse_distance_num_events); i++){ + assert(val[i][1] == 0 && val[i][2] == 0); // no counter multiplexing allowed + rd += val[i][0]; + } + ReuseAddDistance(rd, inc); + } + +#else + + cct_node_t *reusePairNode; + if (wpi->sample.reuseType == REUSE_TEMPORAL){ + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, temporal_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *reuseNode = v.sample_node; + if (reuse_concatenate_use_reuse){ + reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); + }else{ + reusePairNode = getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_TEMPORALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + } + } + else { // REUSE_SPATIAL + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, spatial_reuse_metric_id, SAMPLE_NO_INC, 0/*skipInner*/, 1/*isSync*/, NULL); + cct_node_t *reuseNode = v.sample_node; + if (reuse_concatenate_use_reuse){ + reusePairNode = getConcatenatedNode(reuseNode /*bottomNode*/, wpi->sample.node /*topNode*/, joinNodes[E_SPATIALLY_REUSED_BY][joinNodeIdx] /* joinNode*/); + }else{ + reusePairNode = 
getConcatenatedNode(wpi->sample.node /*bottomNode*/, reuseNode /*topNode*/, joinNodes[E_SPATIALLY_REUSED_FROM][joinNodeIdx] /* joinNode*/); + } + } + + cct_metric_data_increment(reuse_memory_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = (val[0][0] + val[1][0]) }); + cct_metric_data_increment(reuse_memory_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); + + reuseTemporal += inc; + if (wpi->sample.reuseType == REUSE_TEMPORAL){ + cct_metric_data_increment(temporal_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); + } else { + cct_metric_data_increment(spatial_reuse_metric_id, reusePairNode, (cct_metric_data_t){.i = inc}); + } + cct_metric_data_increment(reuse_time_distance_metric_id, reusePairNode, (cct_metric_data_t){.i = time_distance}); + cct_metric_data_increment(reuse_time_distance_count_metric_id, reusePairNode, (cct_metric_data_t){.i = 1}); +#endif + + return ALREADY_DISABLED; } @@ -1331,7 +1749,7 @@ static WPTriggerActionType FalseSharingWPCallback(WatchPointInfo_t *wpi, int sta joinNode = joinNodes[E_FALSE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1366,7 +1784,7 @@ static WPTriggerActionType TrueSharingWPCallback(WatchPointInfo_t *wpi, int star joinNode = joinNodes[E_TRUE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1397,7 +1815,7 @@ static WPTriggerActionType IPCFalseSharingWPCallback(WatchPointInfo_t *wpi, int joinNode = joinNodes[E_IPC_FALSE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, 
NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1426,7 +1844,7 @@ static WPTriggerActionType IPCTrueSharingWPCallback(WatchPointInfo_t *wpi, int s joinNode = joinNodes[E_IPC_TRUE_WW_SHARE][joinNodeIdx]; } } - + sample_val_t v = hpcrun_sample_callpath(wt->ctxt, measured_metric_id, SAMPLE_UNIT_INC, 0/*skipInner*/, 1/*isSync*/, NULL); // insert a special node cct_node_t *node = hpcrun_insert_special_node(v.sample_node, joinNode); @@ -1445,7 +1863,7 @@ static inline bool IsLibMonitorAddress(void * addr) { if(!libmonitorLM){ libmonitorLM = hpcrun_loadmap_findByName(hpcrun_loadmap_findLoadName("libmonitor.so"))->dso_info; } - + if (addr >= libmonitorLM->start_addr && addr < libmonitorLM->end_addr){ return true; } @@ -1456,7 +1874,7 @@ static inline bool IsHPCRunAddress(void * addr) { if(!hpcrunLM){ hpcrunLM = hpcrun_loadmap_findByName(hpcrun_loadmap_findLoadName("libhpcrun.so"))->dso_info; } - + if (addr >= hpcrunLM->start_addr && addr < hpcrunLM->end_addr){ return true; } @@ -1484,7 +1902,7 @@ static inline bool IsValidAddress(void * addr, void * pc){ thread_data_t * td = hpcrun_get_thread_data(); if( (addr == 0) ) return false; - + if( (pc == 0) ) return false; @@ -1493,15 +1911,15 @@ static inline bool IsValidAddress(void * addr, void * pc){ if(IsAltStackAddress(addr)) return false; if(IsFSorGS(addr)) - return false; - + return false; + if(IsBlackListedWatchpointAddress(addr) || IsBlackListedWatchpointAddress(pc)){ return false; } - + if (isTdataAddress(addr)) return false; - + if((addr && !(((unsigned long)addr) & 0xF0000000000000)) && (pc && !(((unsigned long)pc) & 0xF0000000000000))) return true; @@ -1515,7 +1933,7 @@ void ReadSharedDataTransactionally(SharedData_t *localSharedData){ int64_t startCounter = gSharedData.counter; if(startCounter & 1) continue; // Some writer is updating - + __sync_synchronize(); *localSharedData = gSharedData; __sync_synchronize(); @@ -1539,7 +1957,7 @@ int static inline 
GetFloorWPLength(int accessLen){ int static inline GetFloorWPLengthAtAddress(void * address, int accessLen){ uint8_t alignment = ((size_t) address) & (MAX_WP_LENGTH -1); - + switch (alignment) { case 1: case 3: case 5: case 7: /* 1-byte aligned */ return 1; case 2: case 6: /* 2-byte aligned */ return MIN(2, accessLen); @@ -1664,7 +2082,7 @@ void ReadIPCSharedDataTransactionally(IPC_FSInfo *ipcFSInfo){ int64_t startCounter = ipcSharedData->counter; if(startCounter & 1) continue; // Some writer is updating - + __sync_synchronize(); *ipcFSInfo = ipcSharedData->fsInfo; __sync_synchronize(); @@ -1698,11 +2116,11 @@ static inline void create_shared_memory() { if(__sync_bool_compare_and_swap(&ipcSharedData, 0, ptr)){ hpcrun_process_aux_cleanup_add(destroy_shared_memory, NULL); } - + } uint16_t GetOrCreateIPCSharedLMEntry(const char * realPath){ - + if(ipcSharedData == NULL) create_shared_memory(); // start from 1; leave 0 out; @@ -1744,7 +2162,7 @@ unsigned long GetPFN(unsigned long virt_addr){ printf("Error! 
Cannot open %s\n", PA_PATH); goto ErrExit; } - + //Shifting by virt-addr-offset number of bytes //and multiplying by the size of an address (the size of an entry in pagemap file) uint64_t file_offset = virt_addr / getpagesize() * PAGEMAP_ENTRY; @@ -1767,9 +2185,9 @@ unsigned long GetPFN(unsigned long virt_addr){ else c_buf[PAGEMAP_ENTRY - i - 1] = c; } - + fclose(f); - + for(int i=0; i < PAGEMAP_ENTRY; i++){ //printf("%d ",c_buf[i]); read_val = (read_val << 8) + c_buf[i]; @@ -1782,14 +2200,14 @@ unsigned long GetPFN(unsigned long virt_addr){ // printf("Page not present\n"); // if(GET_BIT(read_val, 62)) // printf("Page swapped\n"); - + return INVALID_PHYSICAL_ADDRESS; ErrExit: if(f){ fclose(f); } return INVALID_PHYSICAL_ADDRESS; - + } @@ -1807,7 +2225,7 @@ static inline struct VAPAMap* splayPAtoVAMap(struct VAPAMap* root, unsigned long static void InsertVAtoPAMap(void * va, unsigned long pa){ VAPAMap_t * found = splayVAtoPAMap(vaToPAMap, va); - + // Check if a trace node with traceKey already exists under this context node if(found && (va == found->virtualAddress)) { vaToPAMap = found; @@ -1836,7 +2254,7 @@ static void InsertVAtoPAMap(void * va, unsigned long pa){ static void InsertPAtoVAMap(unsigned long pa, void * va){ VAPAMap_t * found = splayPAtoVAMap(paToVAMap, pa); - + // Check if a trace node with traceKey already exists under this context node if(found && (pa == found->physicalAddress)) { paToVAMap = found; @@ -1929,7 +2347,7 @@ static void UpdateVMMap(){ if(s != 0){ fprintf(stderr, "\n Failed to STAT %s", VA_PATH); } - + if( ((lastVMMAPCheck % VM_MAP_CHECK_FREQUENCY) == 0) && (lastMapChangeTime != mapsStat.st_mtime)) { // New mapping @@ -1952,11 +2370,11 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, unsigned long pa = GetPAfromVA(data_addr); // Ok, on a shared page! // Ok to publish new data? 
- + // Is the published address old enough (stayed for > 1 sample time span) int64_t curTime = rdtsc(); volatile IPC_FSInfo * globalIPCInfo = &(ipcSharedData->fsInfo); - + pid_t me = myTid; // Get the time, tid, and counter // This is definately racy but benign. @@ -1967,14 +2385,14 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, && (pa != INVALID_PHYSICAL_ADDRESS) // my PA is a valid address ) { // Attempt to lockout - + if(__sync_bool_compare_and_swap(&(ipcSharedData->counter), theCounter, theCounter+1)){ } else { // Failed to update ==> someone else succeeded ==> Fetch that address and set a WP for that goto SET_FS_WP; } - - + + globalIPCInfo->time = rdtsc(); globalIPCInfo->tid = myTid; globalIPCInfo->wpType = accessType == LOAD ? WP_WRITE : WP_RW; @@ -1982,7 +2400,7 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, globalIPCInfo->address = pa; globalIPCInfo->offset = PAGE_OFFSET(data_addr); globalIPCInfo->accessLen = accessLen; - + int btLen = 0; for(; btLen < MAX_BACKTRACE_LEN - 1; btLen++){ if (node == NULL) @@ -1990,7 +2408,7 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, globalIPCInfo->backtrace[btLen] = *hpcrun_cct_addr(node); node = hpcrun_cct_parent(node); } - + // unlikely; if btLen == MAX_BACKTRACE_LEN; drop the WP by invalidating it if (btLen == MAX_BACKTRACE_LEN -1 ) { globalIPCInfo->tid = -1; @@ -2010,12 +2428,12 @@ static void HandleIPCFalseSharing(void * data_addr, void * pc, cct_node_t *node, if(va == INVALID_VIRUAL_ADDRESS) { goto ErrExit; } - + va = va + localIPCInfo.offset; - + long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + switch (theWPConfig->id) { case WP_IPC_TRUE_SHARING:{ // Set WP at the same address @@ -2083,7 +2501,7 @@ bool PrintStats(){ void * contextIP = hpcrun_context_pc(context); int v1 = get_access_type(mmap_data->ip); int v2 = get_access_type(contextIP); - + switch(v1){ 
case 0: unk1++; break; case 1: ld1++; break; @@ -2098,7 +2516,7 @@ bool PrintStats(){ case 3: mix2++; break; default: break; } - + float tot = unk1 + ld1 + st1 + mix1; fprintf(stderr, "W=%f (%f), L=%f(%f), M=%f(%f), U=%f(%f)\n", st1/tot, st2/tot, ld1/tot, ld2/tot, mix1/tot, mix2/tot, unk1/tot, unk2/tot); /* @@ -2112,7 +2530,7 @@ bool PrintStats(){ void * contextIP = hpcrun_context_pc(context); extern int is_same_function(void *ins1, void* ins2); int samev1 = is_same_function(contextIP, mmap_data->ip); - + switch(samev1){ case 0: difffunc++; break; case 1: samefunc++; break; @@ -2129,18 +2547,18 @@ bool PrintStats(){ #endif bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, int sampledMetricId) { - void * data_addr = mmap_data->addr; - void * precisePC = (mmap_data->header_misc & PERF_RECORD_MISC_EXACT_IP) ? mmap_data->ip : 0; + void * data_addr = (void *)mmap_data->addr; + void * precisePC = (void *)((mmap_data->header_misc & PERF_RECORD_MISC_EXACT_IP) ? mmap_data->ip : 0); // Filert out address and PC (0 or kernel address will not pass) if (!IsValidAddress(data_addr, precisePC)) { goto ErrExit; // incorrect access type } - + // do not monitor NULL CCT node if (node == NULL) { goto ErrExit; // incorrect CCT } - + // fprintf(stderr, " numWatchpointsSet=%lu\n", wpStats.numWatchpointsSet); int accessLen; @@ -2154,7 +2572,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, //EMSG("Sampled sd.accessType = %d, accessLen=%d at precisePC = %p\n", accessType, accessLen, precisePC); goto ErrExit; // incorrect access type } - + // if the context PC and precise PC are not in the same function, then the sample point is inaccurate. 
bool isSamplePointAccurate; FunctionType ft = is_same_function(contextPC, precisePC); @@ -2163,14 +2581,14 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } else { isSamplePointAccurate = false; } - + switch (theWPConfig->id) { case WP_DEADSPY:{ if(accessType == LOAD){ //EMSG("Sampled accessType = %d\n", accessType); goto ErrExit; // incorrect access type } - + long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; writtenBytes += accessLen * metricThreshold; SampleData_t sd= { @@ -2189,7 +2607,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; - + case WP_REDSPY:{ // If we got an insane address that cannot be read, return silently if(!IsAddressReadable(data_addr)){ @@ -2251,65 +2669,86 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } } break; - case WP_SPATIAL_REUSE:{ + case WP_REUSE:{ +#ifdef REUSE_HISTO +#else + if ( accessType != reuse_monitor_type && reuse_monitor_type != LOAD_AND_STORE) break; +#endif long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - SampleData_t sd= { .node = node, - .type=WP_RW, + .type=WP_RW, //jqswang: Setting it to WP_READ causes segment fault .accessType=accessType, - .wpLength = accessLen, + //.wpLength = accessLen, // set later .accessLength= accessLen, .sampledMetricId=sampledMetricId, .isSamplePointAccurate = isSamplePointAccurate, .preWPAction=theWPConfig->preWPAction, - .isBackTrace = false + .isBackTrace = false, }; +#ifdef REUSE_HISTO + sd.wpLength = 1; +#else sd.wpLength = GetFloorWPLength(accessLen); - // randomly protect another word in the same cache line - uint64_t aligned_pc = ALIGN_TO_CACHE_LINE((uint64_t)data_addr); - if ((rdtsc() & 1) == 0) - sd.va = (void*) (aligned_pc - CACHE_LINE_SZ); - else - sd.va = (void *) (aligned_pc + CACHE_LINE_SZ); + sd.type = reuse_trap_type; +#endif + + 
bool isProfileSpatial; + if (reuse_profile_type == REUSE_TEMPORAL){ + isProfileSpatial = false; + } else if (reuse_profile_type == REUSE_SPATIAL){ + isProfileSpatial = true; + } else { + isProfileSpatial = (rdtsc() & 1); + } + if (isProfileSpatial) {// detect spatial reuse + int wpSizes[] = {8, 4, 2, 1}; + FalseSharingLocs falseSharingLocs[CACHE_LINE_SZ]; + int numFSLocs = 0; + GetAllFalseSharingLocations((size_t)data_addr, accessLen, ALIGN_TO_CACHE_LINE((size_t)(data_addr)), CACHE_LINE_SZ, wpSizes, 0 /*curWPSizeIdx*/ , 4 /*totalWPSizes*/, falseSharingLocs, &numFSLocs); + if (numFSLocs == 0) { // No location is found. It is probably due to the access length already occupies one cache line. So we just monitor the temporal reuse instead. + sd.va = data_addr; + sd.reuseType = REUSE_TEMPORAL; + } else { + int idx = rdtsc() % numFSLocs; //randomly choose one location to monitor + sd.va = (void *)falseSharingLocs[idx].va; + sd.reuseType = REUSE_SPATIAL; #if 0 - int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; - int bound = CACHE_LINE_SZ / accessLen; - int r = rdtsc() % bound; - if (r == offset) r = (r+1) % bound; - sd.va = aligned_pc + (r * accessLen); + int offset = ((uint64_t)data_addr - aligned_pc) / accessLen; + int bound = CACHE_LINE_SZ / accessLen; + int r = rdtsc() % bound; + if (r == offset) r = (r+1) % bound; + sd.va = aligned_pc + (r * accessLen); #endif + } + } else { + sd.va = data_addr; + sd.reuseType = REUSE_TEMPORAL; + } + if (!IsValidAddress(sd.va, precisePC)) { goto ErrExit; // incorrect access type } - SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); - } - break; - case WP_TEMPORAL_REUSE:{ - long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; - accessedIns += metricThreshold; - - SampleData_t sd= { - .va = data_addr, - .node = node, - .type=WP_RW, - .accessType=accessType, - .wpLength = accessLen, - .accessLength= accessLen, - .sampledMetricId=sampledMetricId, - .isSamplePointAccurate = isSamplePointAccurate, - 
.preWPAction=theWPConfig->preWPAction, - .isBackTrace = false - }; - sd.wpLength = GetFloorWPLength(accessLen); + + // Read the reuse distance event counters + // We assume the reading event is load, store or both. + for (int i=0; i < MIN(2, reuse_distance_num_events); i++){ + uint64_t val[3]; + assert(linux_perf_read_event_counter( reuse_distance_events[i], val) >= 0); + //fprintf(stderr, "USE %lu %lu %lu -- ", val[0], val[1], val[2]); + //fprintf(stderr, "USE %lx -- ", val[0]); + memcpy(sd.reuseDistance[i], val, sizeof(uint64_t)*3);; + } + //fprintf(stderr, "\n"); + // register the watchpoint SubscribeWatchpoint(&sd, OVERWRITE, false /* capture value */); } break; case WP_FALSE_SHARING: case WP_TRUE_SHARING: case WP_ALL_SHARING:{ - + // Is the published address old enough (stayed for > 1 sample time span) int64_t curTime = rdtsc(); SharedData_t localSharedData; @@ -2319,7 +2758,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, localSharedData.time = gSharedData.time; localSharedData.tid = gSharedData.tid; localSharedData.counter = gSharedData.counter; - + //ReadSharedDataTransactionally(&localSharedData); if( ((curTime-localSharedData.time) > 2 * (curTime-lastTime)) // Sufficient time passed since the last time somebody published && @@ -2335,7 +2774,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, localSharedData.accessLen = accessLen; localSharedData.counter ++; // makes the counter odd localSharedData.node = node; - + if(__sync_bool_compare_and_swap(&gSharedData.counter, theCounter, theCounter+1)){ gSharedData = localSharedData; __sync_synchronize(); @@ -2349,7 +2788,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, SET_FS_WP: ReadSharedDataTransactionally(&localSharedData); long metricThreshold = hpcrun_id2metric(sampledMetricId)->period; accessedIns += metricThreshold; - + switch (theWPConfig->id) { case WP_TRUE_SHARING:{ // Set WP at the same address @@ 
-2434,7 +2873,7 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, lastTime = curTime; } break; - + case WP_IPC_FALSE_SHARING: case WP_IPC_TRUE_SHARING: { UpdateVMMap(); @@ -2446,10 +2885,9 @@ bool OnSample(perf_mmap_data_t * mmap_data, void * contextPC, cct_node_t *node, } wpStats.numWatchpointsSet ++; return true; - + ErrExit: wpStats.numImpreciseSamples ++; return false; - -} +} diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.c b/src/tool/hpcrun/sample-sources/watchpoint_support.c index fd233d006d..df769499c7 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.c +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.c @@ -189,10 +189,10 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context); __attribute__((constructor)) static void InitConfig(){ tData.fptr = NULL; - + volatile int dummyWP[MAX_WP_SLOTS]; wpConfig.isLBREnabled = true; - + struct perf_event_attr peLBR = { .type = PERF_TYPE_BREAKPOINT, .size = sizeof(struct perf_event_attr), @@ -213,9 +213,19 @@ static void InitConfig(){ } else { wpConfig.isLBREnabled = false; } + { //jqswang: Maybe we can move this part to other better location? 
+ char * lbr_flag_str = getenv("HPCRUN_WP_REUSE_LBR"); + if (lbr_flag_str && 0 == strcasecmp(lbr_flag_str, "ENABLE")){ + wpConfig.isLBREnabled = true; + //printf(stderr,"LBR is set to TRUE\n"); + } else { + wpConfig.isLBREnabled = false; + //fprintf(stderr,"LBR is set to FALSE\n"); + } + } CHECK(close(fd)); - - + + #if defined(FAST_BP_IOC_FLAG) wpConfig.isWPModifyEnabled = true; #else @@ -225,7 +235,7 @@ static void InitConfig(){ //wpConfig.signalDelivered = SIGIO; //wpConfig.signalDelivered = SIGUSR1; wpConfig.signalDelivered = SIGRTMIN + 3; - + // Setup the signal handler sigset_t block_mask; sigfillset(&block_mask); @@ -235,18 +245,18 @@ static void InitConfig(){ .sa_mask = block_mask, .sa_flags = SA_SIGINFO | SA_RESTART | SA_NODEFER | SA_ONSTACK }; - + if(monitor_sigaction(wpConfig.signalDelivered, OnWatchPoint, 0 /*flags*/, &sa1) == -1) { fprintf(stderr, "Failed to set WHICH_SIG handler: %s\n", strerror(errno)); monitor_real_abort(); } - - - - - + + + + + wpConfig.pgsz = sysconf(_SC_PAGESIZE); - + // identify max WP supported by the architecture volatile int wpHandles[MAX_WP_SLOTS]; int i = 0; @@ -270,7 +280,7 @@ static void InitConfig(){ break; } } - + if(i == 0) { fprintf(stderr, "Cannot create a single watch point\n"); monitor_real_abort(); @@ -279,10 +289,10 @@ static void InitConfig(){ CHECK(close(wpHandles[j])); } wpConfig.maxWP = i; - + // Should we get the floating point type in an access? wpConfig.getFloatType = false; - + // Get the replacement scheme char * replacementScheme = getenv("HPCRUN_WP_REPLACEMENT_SCHEME"); if(replacementScheme){ @@ -300,7 +310,7 @@ static void InitConfig(){ // default; wpConfig.replacementPolicy = AUTO; } - + // Should we fix IP off by one? char * fixIP = getenv("HPCRUN_WP_DONT_FIX_IP"); if(fixIP){ @@ -316,7 +326,7 @@ static void InitConfig(){ // default; wpConfig.dontFixIP = false; } - + // Should we get the address in a WP trigger? 
char * disassembleWPAddress = getenv("HPCRUN_WP_DONT_DISASSEMBLE_TRIGGER_ADDRESS"); if(disassembleWPAddress){ @@ -333,8 +343,8 @@ static void InitConfig(){ wpConfig.dontDisassembleWPAddress = false; } - - + + } void RedSpyWPConfigOverride(void *v){ @@ -377,16 +387,11 @@ void IPCAllSharingWPConfigOverride(void *v){ } -void TemporalReuseWPConfigOverride(void *v){ - // dont fix IP - wpConfig.dontFixIP = true; - wpConfig.dontDisassembleWPAddress = true; -} - -void SpatialReuseWPConfigOverride(void *v){ +void ReuseWPConfigOverride(void *v){ // dont fix IP wpConfig.dontFixIP = true; wpConfig.dontDisassembleWPAddress = true; + wpConfig.isLBREnabled = false; //jqswang } static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, bool modify) { @@ -404,7 +409,7 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, .exclude_hv = 1, .disabled = 0, /* enabled */ }; - + switch (sampleData->wpLength) { case 1: pe.bp_len = HW_BREAKPOINT_LEN_1; break; case 2: pe.bp_len = HW_BREAKPOINT_LEN_2; break; @@ -415,13 +420,13 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, monitor_real_abort(); } pe.bp_addr = (uintptr_t)sampleData->va; - + switch (sampleData->type) { case WP_READ: pe.bp_type = HW_BREAKPOINT_R; break; case WP_WRITE: pe.bp_type = HW_BREAKPOINT_W; break; default: pe.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; } - + #if defined(FAST_BP_IOC_FLAG) if(modify) { // modification @@ -444,10 +449,10 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData, } // Set the perf_event file to async mode CHECK(fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL, 0) | O_ASYNC)); - + // Tell the file to send a signal when an event occurs CHECK(fcntl(perf_fd, F_SETSIG, wpConfig.signalDelivered)); - + // Deliver the signal to this thread struct f_owner_ex fown_ex; fown_ex.type = F_OWNER_TID; @@ -457,17 +462,17 @@ static void CreateWatchPoint(WatchPointInfo_t * wpi, SampleData_t * 
sampleData, EMSG("Failed to set the owner of the perf event file: %s\n", strerror(errno)); return; } - - + + // CHECK(fcntl(perf_fd, F_SETOWN, gettid())); - + wpi->fileHandle = perf_fd; // mmap the file if lbr is enabled - if(wpConfig.isLBREnabled) { + //if(wpConfig.isLBREnabled) { wpi->mmapBuffer = MAPWPMBuffer(perf_fd); - } + //} } - + wpi->isActive = true; wpi->va = (void *) pe.bp_addr; wpi->sample = *sampleData; @@ -490,7 +495,7 @@ static void CreateDummyHardwareEvent(void) { .exclude_hv = 1, .branch_sample_type = PERF_SAMPLE_BRANCH_ANY, }; - + // Create the perf_event for this thread on all CPUs with no event group int perf_fd = perf_event_open(&pe, 0, -1, -1, 0); if (perf_fd == -1) { @@ -508,14 +513,14 @@ static void CloseDummyHardwareEvent(int perf_fd){ /*********** Client interfaces *******/ static void DisArm(WatchPointInfo_t * wpi){ - + // assert(wpi->isActive); assert(wpi->fileHandle != -1); - + if(wpi->mmapBuffer) UNMAPWPMBuffer(wpi->mmapBuffer); wpi->mmapBuffer = 0; - + CHECK(close(wpi->fileHandle)); wpi->fileHandle = -1; wpi->isActive = false; @@ -531,7 +536,7 @@ static bool ArmWatchPoint(WatchPointInfo_t * wpi, SampleData_t * sampleData) { return true; } } - + // disable the old WP if active if(wpi->isActive) { DisArm(wpi); @@ -554,7 +559,7 @@ void WatchpointThreadInit(WatchPointUpCall_t func){ EMSG("Failed sigaltstack"); monitor_real_abort(); } - + tData.lbrDummyFD = -1; tData.fptr = func; tData.fs_reg_val = (void*)-1; @@ -567,15 +572,15 @@ void WatchpointThreadInit(WatchPointUpCall_t func){ tData.numWatchpointDropped = 0; tData.numSampleTriggeringWatchpoints = 0; tData.numInsaneIP = 0; - - + + for (int i=0; i tData.watchPointArray[i].startTime) { @@ -704,7 +709,7 @@ static VictimType GetVictim(int * location, ReplacementPolicy policy){ return NON_EMPTY_SLOT; } break; - + case EMPTY_SLOT_ONLY:{ return NONE_AVAILABLE; } @@ -729,12 +734,12 @@ static void ConsumeAllRingBufferData(void *mbuf) { * data points to beginning of buffer payload */ void * data 
= ((void *)hdr) + wpConfig.pgsz; - + /* * position of tail within the buffer payload */ tail = hdr->data_tail & pgmsk; - + /* * size of what is available * @@ -764,12 +769,12 @@ static int ReadMampBuffer(void *mbuf, void *buf, size_t sz) { * data points to beginning of buffer payload */ data = ((void *)hdr) + wpConfig.pgsz; - + /* * position of tail within the buffer payload */ tail = hdr->data_tail & pgmsk; - + /* * size of what is available * @@ -781,15 +786,15 @@ static int ReadMampBuffer(void *mbuf, void *buf, size_t sz) { rmb(); return -1; } - + /* From perf_event_open() manpage */ rmb(); - - + + /* * sz <= avail_sz, we can satisfy the request */ - + /* * c = size till end of buffer * @@ -797,23 +802,23 @@ static int ReadMampBuffer(void *mbuf, void *buf, size_t sz) { * a power of two, so we can do: */ c = pgmsk + 1 - tail; - + /* * min with requested size */ m = c < sz ? c : sz; - + /* copy beginning */ memcpy(buf, data + tail, m); - + /* * copy wrapped around leftover */ if (sz > m) memcpy(buf + m, data, sz - m); - + hdr->data_tail += sz; - + return 0; } @@ -867,7 +872,7 @@ static inline void * GetPatchedIP(void * contextIP) { static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrigger_t *wpt, void * context){ //struct perf_event_mmap_page * b = wpi->mmapBuffer; struct perf_event_header hdr; - + if (ReadMampBuffer(wpi->mmapBuffer, &hdr, sizeof(struct perf_event_header)) < 0) { EMSG("Failed to ReadMampBuffer: %s\n", strerror(errno)); monitor_real_abort(); @@ -885,7 +890,6 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig EMSG("Failed to ReadMampBuffer: %s\n", strerror(errno)); monitor_real_abort(); } - if(! 
(hdr.misc & PERF_RECORD_MISC_EXACT_IP)){ //EMSG("PERF_SAMPLE_IP imprecise\n"); tData.numWatchpointImpreciseIP ++; @@ -899,7 +903,8 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig reliableIP = patchedIP; } else { // Fake as requested by Xu for reuse clients - reliableIP = contextIP-1; + //reliableIP = contextIP-1; + reliableIP = 0; //To avoid ambiguity, just ignore the inaccurate PC for the moment for reuse client. TODO: Can we still use the inaccurate pc? } //EMSG("PERF_SAMPLE_IP imprecise: %p patched to %p in WP handler\n", tmpIP, patchedIP); } else { @@ -932,9 +937,9 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig reliableIP = contextIP-1; } } - + wpt->pc = reliableIP; - + if(wpConfig.dontDisassembleWPAddress == false){ FloatType * floatType = wpConfig.getFloatType? &wpt->floatType : 0; if(false == get_mem_access_length_and_type_address(wpt->pc, (uint32_t*) &(wpt->accessLength), &(wpt->accessType), floatType, context, &addr)){ @@ -945,8 +950,8 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig //EMSG("WP triggered 0 access length! 
at pc=%p\n", wpt->pc); goto ErrExit; } - - + + void * patchedAddr = (void *)-1; // Stack affecting addresses will be off by 8 // Some instructions affect the address computing register: mov (%rax),%eax @@ -957,7 +962,7 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig else tData.numWatchpointImpreciseAddressArbitraryLength ++; - + tData.numWatchpointImpreciseAddressArbitraryLength ++; patchedAddr = wpi->va; } else { @@ -992,7 +997,7 @@ static bool CollectWatchPointTriggerInfo(WatchPointInfo_t * wpi, WatchPointTrig //SkipBuffer(wpi->mmapBuffer , hdr.size - sizeof(hdr)); goto ErrExit; } - + ErrExit: // We must cleanup the mmap buffer if there is any data left ConsumeAllRingBufferData(wpi->mmapBuffer); @@ -1017,12 +1022,12 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ // and return and avoid any MSG. void* pc = hpcrun_context_pc(context); if (!hpcrun_safe_enter_async(pc)) return 0; - + linux_perf_events_pause(); - + tData.numWatchpointTriggers++; //fprintf(stderr, " numWatchpointTriggers = %lu, \n", tData.numWatchpointTriggers); - + //find which watchpoint fired int location = -1; for(int i = 0 ; i < wpConfig.maxWP; i++) { @@ -1031,7 +1036,7 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ break; } } - + // Ensure it is an active WP if(location == -1) { EMSG("\n WP trigger did not match any known active WP\n"); @@ -1041,7 +1046,7 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ //fprintf("\n WP trigger did not match any known active WP\n"); return 0; } - + WatchPointTrigger_t wpt; WPTriggerActionType retVal; WatchPointInfo_t *wpi = &tData.watchPointArray[location]; @@ -1062,15 +1067,15 @@ static int OnWatchPoint(int signum, siginfo_t *info, void *context){ monitor_real_abort(); break; } - - + + if( false == CollectWatchPointTriggerInfo(wpi, &wpt, context)) { tData.numWatchpointDropped++; retVal = DISABLE_WP; // disable if unable to collect any info. 
} else { retVal = tData.fptr(wpi, 0, wpt.accessLength/* invalid*/, &wpt); } - + // Let the client take action. switch (retVal) { case DISABLE_WP: { @@ -1127,7 +1132,7 @@ static bool ValidateWPData(SampleData_t * sampleData){ else return false; break; - + default: EMSG("Unsuppported WP length %d", sampleData->wpLength); monitor_real_abort(); @@ -1170,19 +1175,19 @@ bool SubscribeWatchpoint(SampleData_t * sampleData, OverwritePolicy overwritePol if(IsOveralpped(sampleData)){ return false; // drop the sample if it overlaps an existing address } - + // No overlap, look for a victim slot int victimLocation = -1; // Find a slot to install WP VictimType r = GetVictim(&victimLocation, wpConfig.replacementPolicy); - + if(r != NONE_AVAILABLE) { // VV IMP: Capture value before arming the WP. if(captureValue) CaptureValue(sampleData, &tData.watchPointArray[victimLocation]); // I know the error case that we have captured the value but ArmWatchPoint fails. // I am not handling that corner case because ArmWatchPoint() will fail with a monitor_real_abort(). 
- + if(ArmWatchPoint(&tData.watchPointArray[victimLocation], sampleData) == false){ //LOG to hpcrun log EMSG("ArmWatchPoint failed for address %p", sampleData->va); @@ -1202,14 +1207,14 @@ WPUpCallTRetType Test1UpCall(WatchPointInfo_t * wp, WatchPointTrigger_t * wt) { printf("\n Test1UpCall %p\n", wt->va); if(wpConfig.isLBREnabled) assert(wp->sample.va == wt->va); - + cnt ++; return DISABLE; } void TestBasic(){ tData.fptr = Test1UpCall; - + sigset_t block_mask; sigemptyset (&block_mask); // Set a signal handler for SIGUSR1 @@ -1218,18 +1223,18 @@ void TestBasic(){ // .sa_mask = block_mask, .sa_flags = SA_SIGINFO | SA_RESTART | SA_NODEFER }; - + if(sigaction(wpConfig.signalDelivered, &sa1, NULL) == -1) { fprintf(stderr, "Failed to set WHICH_SIG handler: %s\n", strerror(errno)); monitor_real_abort(); } - - + + WatchpointThreadInit(); int N = 10000; volatile int dummyWPLocation[10000]; cnt = 0; - + for(int i = 0 ; i < N; i++) { SampleData_t s = {.va = &dummyWPLocation[i], .wpLength = sizeof(int), .type = WP_WRITE}; SubscribeWatchpoint(&s, AUTO); diff --git a/src/tool/hpcrun/sample-sources/watchpoint_support.h b/src/tool/hpcrun/sample-sources/watchpoint_support.h index 3b0d066e94..233502a103 100644 --- a/src/tool/hpcrun/sample-sources/watchpoint_support.h +++ b/src/tool/hpcrun/sample-sources/watchpoint_support.h @@ -60,6 +60,7 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) #define MAX_WP_LENGTH (8L) #define CACHE_LINE_SZ (64) #define ALIGN_TO_CACHE_LINE(addr) ((uint64_t)(addr) & (~(CACHE_LINE_SZ-1))) @@ -84,6 +85,7 @@ typedef enum MergePolicy {AUTO_MERGE, NO_MERGE, CLIENT_ACTION} MergePolicy; typedef enum OverwritePolicy {OVERWRITE, NO_OVERWRITE} OverwritePolicy; typedef enum VictimType {EMPTY_SLOT, NON_EMPTY_SLOT, NONE_AVAILABLE} VictimType; typedef enum WPTriggerActionType {DISABLE_WP, ALREADY_DISABLED, DISABLE_ALL_WP, RETAIN_WP} WPTriggerActionType; +typedef enum ReuseType { REUSE_TEMPORAL, REUSE_SPATIAL, REUSE_BOTH} ReuseType; // for reuse client // Data structure that is given by clients to set a WP typedef struct SampleData{ @@ -101,6 +103,8 @@ typedef struct SampleData{ WPTriggerActionType preWPAction; bool isSamplePointAccurate; bool isBackTrace; + ReuseType reuseType; + uint64_t reuseDistance[2][3]; } SampleData_t; typedef struct WatchPointInfo{ @@ -152,8 +156,7 @@ extern bool IsFSorGS(void *addr); extern double ProportionOfWatchpointAmongOthersSharingTheSameContext(WatchPointInfo_t *wpi); -extern void TemporalReuseWPConfigOverride(void*); -extern void SpatialReuseWPConfigOverride(void*); +extern void ReuseWPConfigOverride(void*); extern void FalseSharingWPConfigOverride(void*); extern void TrueSharingWPConfigOverride(void*); extern void AllSharingWPConfigOverride(void*); diff --git a/src/tool/hpcrun/thread_data.h b/src/tool/hpcrun/thread_data.h index 4d5ce25438..df4b09ad34 100644 --- a/src/tool/hpcrun/thread_data.h +++ b/src/tool/hpcrun/thread_data.h @@ -169,6 +169,8 @@ typedef struct thread_data_t { core_profile_trace_data_t core_profile_trace_data; + hpcio_outbuf_t witch_client_trace_output; //jqswang: it is used to output any data from witch client for post-mortem processing + // ---------------------------------------- // backtrace buffer // ---------------------------------------- diff --git a/src/tool/hpcrun/utilities/tokenize.c b/src/tool/hpcrun/utilities/tokenize.c index 9002b7470c..7632e59fd4 100644 --- 
a/src/tool/hpcrun/utilities/tokenize.c +++ b/src/tool/hpcrun/utilities/tokenize.c @@ -101,8 +101,8 @@ next_tok(void) int hpcrun_extract_threshold ( - const char *input_string, - long *threshold, + const char *input_string, + long *threshold, long default_value ) { @@ -112,13 +112,13 @@ hpcrun_extract_threshold *threshold = default_value; type = THRESH_DEFAULT; } else { - if (*input_string == PREFIX_FREQUENCY) { + if (*input_string == PREFIX_FREQUENCY) { input_string++; // skip the PREFIX_FREQUENCY character type = THRESH_FREQ; } else { type = THRESH_VALUE; } - + char *endptr; long value = strtol(input_string, &endptr, 10); if (value == 0) { @@ -129,7 +129,7 @@ hpcrun_extract_threshold if (endptr == input_string) { value = default_value; if (type == THRESH_FREQ) { - // FIXME: Laksono: we have some choices + // FIXME: Laksono: we have some choices // // Should the type be changed to THRESH_VALUE here? // @@ -138,21 +138,21 @@ hpcrun_extract_threshold } } } - + *threshold = value; } - + return type; } // event option syntax is event_name @ [f] threshold // if the f indicator exist, the number is the frequency, otherwise // it's a period number -// Returns: +// Returns: // THRESH_FREQ if event has explicit frequency // THRESH_VALUE if event has explicit threshold, // THRESH_DEFAULT if using default. -// +// int hpcrun_extract_ev_thresh(const char *in, int evlen, char *ev, long *th, long def) { @@ -180,7 +180,7 @@ hpcrun_extract_ev_thresh(const char *in, int evlen, char *ev, long *th, long def strncpy(ev, in, len); ev[len] = '\0'; *th = def; - + return THRESH_DEFAULT; }