Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
b3751d4
Fix TT ABI compatibility
ApoKalipse-V Jun 17, 2025
3d9a042
Add changelog
ApoKalipse-V Jun 17, 2025
fdf0609
Implementation fix
ApoKalipse-V Jun 17, 2025
ba6ba7e
Format
ApoKalipse-V Jun 17, 2025
811402d
Compiler fix
ApoKalipse-V Jun 17, 2025
789ac73
Apply suggestions from code review
Jun 18, 2025
ec87d87
More review comments
ApoKalipse-V Jun 20, 2025
d7cd954
Format
ApoKalipse-V Jun 20, 2025
363773d
Clang tidy fix
ApoKalipse-V Jun 23, 2025
6ef19f3
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jun 26, 2025
0dc37e4
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jun 26, 2025
a79d685
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jun 30, 2025
aa630ee
Apply suggestions from code review
Jul 1, 2025
1055c6c
Apply suggestions from code review
Jul 1, 2025
4c852d5
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 1, 2025
bc224b3
Fix conflict with review comments
ApoKalipse-V Jul 1, 2025
4b84b89
Apply suggestions from code review
Jul 1, 2025
a32b3c2
Update source/include/rocprofiler-sdk/experimental/thread-trace/trace…
Jul 1, 2025
62c2214
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 2, 2025
a01daca
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 8, 2025
f5e34ff
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 9, 2025
f5dc6b5
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 9, 2025
5380b3e
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 10, 2025
3a314ea
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 11, 2025
a20774b
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 22, 2025
4eac8c6
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 22, 2025
6cef5e0
Merge branch 'amd-staging' into gbaraldi/fix_tt_abi
Jul 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,9 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
### Removed

- Support of gfx940 and gfx941 targets from compilation

## ROCprofiler-SDK 1.1.0 for ROCm release 7.1.0

### Resolved issues

- Removed bitfields from the thread trace API for better ABI compatibility
2 changes: 1 addition & 1 deletion source/bin/rocprofv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ def add_parser_bool_argument(gparser, *args, **kwargs):

att_options.add_argument(
"--att-buffer-size",
help="Thread trace buffer size. Default 96MB",
help="Thread trace buffer size. Default 256MB",
default=None,
type=str,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ typedef struct rocprofiler_thread_trace_parameter_t
struct
{
rocprofiler_counter_id_t counter_id;
uint64_t simd_mask : 4;
uint8_t simd_mask;
};
};
} rocprofiler_thread_trace_parameter_t;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@
* @{
*/

/**
* @brief Timestamp expressed in shader clock units, stored as a signed 64-bit integer.
*
* A value of 0 marks the start of the trace on a shader engine.
* Different shader engines may have different frequencies or start points.
*/
typedef int64_t rocprofiler_shader_timestamp_t;

/**
* @brief Describes the type of info received.
*/
Expand All @@ -57,28 +64,29 @@ typedef struct rocprofiler_thread_trace_decoder_pc_t
*/
typedef struct rocprofiler_thread_trace_decoder_perfevent_t
{
int64_t time; ///< Shader clock timestamp in which these counters were read.
uint16_t events0; ///< Counter0 (bank==0) or Counter4 (bank==1).
uint16_t events1; ///< Counter1 (bank==0) or Counter5 (bank==1).
uint16_t events2; ///< Counter2 (bank==0) or Counter6 (bank==1).
uint16_t events3; ///< Counter3 (bank==0) or Counter7 (bank==1).
uint8_t CU; ///< Shader compute unit ID these counters were collected from.
uint8_t bank; ///< Selects counter group [0,3] or [4,7]
rocprofiler_shader_timestamp_t timestamp; ///< Shader clock timestamp at which these counters were read.

uint16_t event0; ///< Counter0 (bank==0) or Counter4 (bank==1).
uint16_t event1; ///< Counter1 (bank==0) or Counter5 (bank==1).
uint16_t event2; ///< Counter2 (bank==0) or Counter6 (bank==1).
uint16_t event3; ///< Counter3 (bank==0) or Counter7 (bank==1).
uint8_t cu; ///< Shader compute unit ID these counters were collected from.
uint8_t bank; ///< Selects counter group [0,3] or [4,7]
} rocprofiler_thread_trace_decoder_perfevent_t;

/**
* @brief Describes an occupancy event (wave started or wave ended).
*/
typedef struct rocprofiler_thread_trace_decoder_occupancy_t
{
rocprofiler_thread_trace_decoder_pc_t pc; ///< Wave start address (kernel entry point)
uint64_t time; ///< Timestamp of event
uint8_t reserved; ///< Reserved
uint8_t cu; ///< Compute unit ID (gfx9) or WGP ID (gfx10+).
uint8_t simd; ///< SIMD ID [0,3] within compute unit
uint8_t slot; ///< Wave slot ID within SIMD
uint32_t start : 1; ///< 1 if wave_start, 0 if a wave_end
uint32_t _rsvd : 31;
rocprofiler_thread_trace_decoder_pc_t pc; ///< Wave start address (kernel entry point)
rocprofiler_shader_timestamp_t timestamp; ///< Timestamp of event, in shader clock units

uint8_t start; ///< 1 = Wave start event, 0 = wave end event
uint8_t cu; ///< Compute unit ID (gfx9) or WGP ID (gfx10+)
uint8_t simd; ///< SIMD ID [0,3] within compute unit
uint8_t slot; ///< Wave slot ID within SIMD
uint32_t reserved; ///< Reserved
} rocprofiler_thread_trace_decoder_occupancy_t;

/**
Expand Down Expand Up @@ -127,17 +135,23 @@ typedef enum rocprofiler_thread_trace_decoder_inst_category_t
/**
* @brief Describes an instruction execution event.
*
* The duration is measured as stall+issue time (gfx9) or stall+execution time (gfx10+).
* Time + duration marks the issue (gfx9) or execution (gfx10+) completion time.
* Time + stall marks the successful issue time.
* Duration - stall is the issue time (gfx9) or execution time (gfx10+).
* Bitfields defined exec_time and category
*
* Exec time is defined differently for different architectures:
* ::exec == issue time (gfx9) or ::exec == completion time (gfx10+)
*
* ::time marks when the wave first attempted to execute this instruction
* ::time + ::duration marks the issue completion (gfx9) or completion (gfx10+) time
* ::time + stall marks the successful issue of the instruction (when ::exec begins)
* Stalled time can be computed as: ::duration - ::exec
*/
typedef struct rocprofiler_thread_trace_decoder_inst_t
{
uint32_t category : 8; ///< One of rocprofiler_thread_trace_decoder_inst_category_t
uint32_t stall : 24; ///< Stall duration, in clock cycles.
int32_t duration; ///< Total instruction duration, in clock cycles.
int64_t time; ///< When the wave first attempted to execute this instruction.
uint8_t category; ///< One of rocprofiler_thread_trace_decoder_inst_category_t
uint8_t reserved; ///< Reserved
uint16_t exec; ///< Exec time: issue time (gfx9) or completion time (gfx10+). Stall is duration - exec.
int32_t duration; ///< Total instruction duration, in clock cycles. Stall + Exec
rocprofiler_shader_timestamp_t time; ///< When the wave first attempted to execute this instruction.
rocprofiler_thread_trace_decoder_pc_t pc;
} rocprofiler_thread_trace_decoder_inst_t;

Expand All @@ -159,8 +173,8 @@ typedef struct rocprofiler_thread_trace_decoder_wave_t
uint32_t _rsvd2;
uint32_t _rsvd3;

int64_t begin_time; ///< Wave begin time. Should match occupancy event wave start.
int64_t end_time; ///< Wave end time. Should match occupancy event wave end.
rocprofiler_shader_timestamp_t start_timestamp; ///< Matches occupancy event wave start.
rocprofiler_shader_timestamp_t end_timestamp; ///< Matches occupancy event wave end.

size_t timeline_size; ///< timeline_array size
size_t instructions_size; ///< instructions_array size
Expand Down
2 changes: 1 addition & 1 deletion source/lib/att-tool/occupancy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ OccupancyFile(const Fspath& dir,
for(const auto& event : eventlist)
{
nlohmann::json json_event;
json_event.push_back(event.time);
json_event.push_back(event.timestamp);
json_event.push_back(event.cu);
json_event.push_back(event.simd);
json_event.push_back(event.slot);
Expand Down
12 changes: 6 additions & 6 deletions source/lib/att-tool/perfcounter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ PerfcounterFile(WaveConfig& config, const perfevent_t* events, size_t event_coun
const auto& event = events[i];

nlohmann::json json_event;
json_event.push_back(event.time);
json_event.push_back(event.events0);
json_event.push_back(event.events1);
json_event.push_back(event.events2);
json_event.push_back(event.events3);
json_event.push_back(event.CU);
json_event.push_back(event.timestamp);
json_event.push_back(event.event0);
json_event.push_back(event.event1);
json_event.push_back(event.event2);
json_event.push_back(event.event3);
json_event.push_back(event.cu);
json_event.push_back(event.bank);

data.push_back(json_event);
Expand Down
4 changes: 2 additions & 2 deletions source/lib/att-tool/profile_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ get_trace_data(rocprofiler_thread_trace_decoder_record_type_t trace_id,
for(size_t wave_n = 0; wave_n < trace_size; wave_n++)
{
const auto& wave = static_cast<const wave_t*>(trace_events)[wave_n];
int64_t prev_inst_time = wave.begin_time;
auto prev_inst_time = wave.start_timestamp;

for(size_t j = 0; j < wave.instructions_size; j++)
{
Expand All @@ -87,7 +87,7 @@ get_trace_data(rocprofiler_thread_trace_decoder_record_type_t trace_id,
auto& line = tool.get(inst.pc);
line.hitcount += 1;
line.latency += inst.duration;
line.stall += inst.stall;
line.stall += inst.duration - inst.exec;
line.idle += std::max<int64_t>(inst.time - prev_inst_time, 0);
} catch(...)
{
Expand Down
14 changes: 7 additions & 7 deletions source/lib/att-tool/wave.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ WaveFile::WaveFile(WaveConfig& config, const wave_t& wave)
filename,
FilenameMgr::Coord{
config.shader_engine, (int) wave.simd, (int) wave.wave_id, assigned_id},
wave.begin_time,
wave.end_time);
wave.start_timestamp,
wave.end_timestamp);
}

nlohmann::json instructions;
Expand All @@ -64,13 +64,13 @@ WaveFile::WaveFile(WaveConfig& config, const wave_t& wave)
auto& inst = wave.instructions_array[i];
instructions.push_back({inst.time,
static_cast<int>(inst.category),
static_cast<int>(inst.stall),
static_cast<int>(inst.duration - inst.exec),
static_cast<int64_t>(inst.duration),
config.code->line_numbers[inst.pc]});
}

nlohmann::json timeline;
int64_t acc_time = wave.begin_time;
auto acc_time = wave.start_timestamp;

for(size_t i = 0; i < wave.timeline_size; i++)
{
Expand Down Expand Up @@ -110,8 +110,8 @@ WaveFile::WaveFile(WaveConfig& config, const wave_t& wave)
{"id", assigned_id},
{"simd", wave.simd},
{"slot", wave.wave_id},
{"begin", wave.begin_time},
{"end", wave.end_time},
{"begin", wave.start_timestamp},
{"end", wave.end_timestamp},

{"instructions", instructions},
{"timeline", timeline},
Expand All @@ -120,7 +120,7 @@ WaveFile::WaveFile(WaveConfig& config, const wave_t& wave)

nlohmann::json metadata = {
{"name", "SE" + std::to_string(config.shader_engine)},
{"duration", wave.end_time - wave.begin_time},
{"duration", wave.end_timestamp - wave.start_timestamp},
{"wave", wave_entry},
{"num_stitched", wave.instructions_size},
{"num_insts", wave.instructions_size},
Expand Down
4 changes: 2 additions & 2 deletions source/lib/rocprofiler-sdk-tool/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ format_name(std::string_view _name, const config& = get_config<>());
struct att_perfcounter
{
std::string counter_name = {};
uint32_t simd_mask = 0xf;
uint8_t simd_mask = 0xF;

template <typename ArchiveT>
void save(ArchiveT&) const;
Expand Down Expand Up @@ -139,7 +139,7 @@ struct config : output_config
int mpi_rank = get_mpi_rank();
uint64_t att_param_shader_engine_mask =
get_env<uint64_t>("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1);
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000);
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x10000000);
uint64_t att_param_simd_select = get_env<uint64_t>("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF);
uint64_t att_param_target_cu = get_env<uint64_t>("ROCPROF_ATT_PARAM_TARGET_CU", 1);
uint64_t att_param_perf_ctrl = get_env<uint64_t>("ROCPROF_ATT_PARAM_PERFCOUNTER_CTRL", 0);
Expand Down
56 changes: 51 additions & 5 deletions source/lib/rocprofiler-sdk/thread_trace/decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,16 @@ rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_
{
auto dl = std::make_unique<DL>(path);
if(dl->handle == nullptr) return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE;
if(!dl->valid()) return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;

if(!dl->valid() || dl->version > TTD_API_VERSION)
{
ROCP_CI_LOG(ERROR) << "Incompatible decoder version: v" << dl->version_major << '.'
<< dl->version_minor << "\nExpected: " << TTD_API_VERSION_MAJOR << '.'
<< TTD_API_VERSION_MINOR << " or lower.\nPlease update the SDK or"
<< " use a compatible library version.";

return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
}

auto lk = std::unique_lock{map_mut};
static uint64_t count = 1;
Expand Down Expand Up @@ -204,9 +213,47 @@ trace_callback(rocprofiler_thread_trace_decoder_record_type_t record_type_id,
uint64_t trace_size,
void* userdata)
{
// Intermediate callback handed to the decoder library: validates the context passed
// through userdata, patches records produced by old (<= v0.1) decoders in place so they
// match the current struct layout, then forwards the records to the user's callback.
using wave_t = rocprofiler_thread_trace_decoder_wave_t;
using occupancy_t = rocprofiler_thread_trace_decoder_occupancy_t;

ROCP_FATAL_IF(userdata == nullptr) << "Userdata is null!";
auto* trace_data = static_cast<trace_data_t*>(userdata);

ROCP_FATAL_IF(trace_data->decoder == nullptr) << "Decoder is null!";
ROCP_FATAL_IF(trace_data->decoder->dl == nullptr) << "DL is null!";

// For version v0.1, we need to convert stall to exec time and fix the bitshifts.
// The <= comparison also matches a reported version of 0.
if(trace_data->decoder->dl->version <= TTD_MAKE_VERSION(0, 1))
{
if(record_type_id == ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE)
{
// trace_events is an array of trace_size waves; every instruction of every wave
// is rewritten in place.
for(size_t w = 0; w < trace_size; w++)
{
auto& wave = static_cast<wave_t*>(trace_events)[w];
for(size_t i = 0; i < wave.instructions_size; i++)
{
auto& inst = wave.instructions_array[i];
// v0.1 uses the 24 high bits as stall: rebuild it from the bytes now named
// `reserved` (low 8 bits) and `exec` (upper 16 bits).
// NOTE(review): presumably relies on a little-endian field layout - confirm.
auto stall = (static_cast<uint64_t>(inst.exec) << 8) | inst.reserved;
// Duration is defined as exec + stall
// NOTE(review): the difference is narrowed to uint16_t, so it wraps if
// stall > duration or if exec exceeds 65535 cycles - confirm this cannot occur.
inst.exec = static_cast<uint16_t>(inst.duration - stall);
inst.reserved = 0;
}
}
}
else if(record_type_id == ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY)
{
for(size_t i = 0; i < trace_size; i++)
{
auto& event = static_cast<occupancy_t*>(trace_events)[i];
// Any non-zero value in `reserved` is treated as a wave start; presumably v0.1
// stored its start flag in the 32-bit word that is now `reserved` - confirm.
event.start = event.reserved == 0 ? 0 : 1;
event.reserved = 0;
}
}
}

// Forward the (possibly patched) records together with the user's own userdata pointer.
trace_data->cb(record_type_id, trace_events, trace_size, trace_data->userdata);

return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS;
}

Expand All @@ -229,11 +276,10 @@ rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle,
.cb = user_callback,
.userdata = userdata};

auto status =
decoder->dl->att_parse_data_fn(copy_trace_data, trace_callback, isa_callback, &cbdata);
auto status = decoder->dl->parse(copy_trace_data, trace_callback, isa_callback, &cbdata);
if(status != ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS)
{
const char* statustr = decoder->dl->att_status_fn(status);
const char* statustr = decoder->dl->status(status);
if(statustr == nullptr) statustr = "Unknown error";
ROCP_ERROR << "Callback failed with status " << status << ": " << statustr;

Expand All @@ -255,6 +301,6 @@ rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_ha
auto decoder = get_dl(handle);
if(decoder == nullptr) return nullptr;

return decoder->dl->att_info_fn(info);
return decoder->dl->info(info);
}
}
17 changes: 12 additions & 5 deletions source/lib/rocprofiler-sdk/thread_trace/dl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,25 @@ namespace thread_trace
{
// Opens "librocprof-trace-decoder.so" inside the directory `libpath` and looks up the decoder
// entry points. Returns early, leaving `handle` null, when `libpath` is null or dlopen fails.
// The dlsym results are stored without being checked here.
DL::DL(const char* libpath)
{
using version_fn_t = decltype(rocprof_trace_decoder_get_version);

if(libpath == nullptr) return;

// libpath is treated as a directory; the library file name is appended to it.
auto path = common::filesystem::path(libpath) / "librocprof-trace-decoder.so";

handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
if(!handle) return;

// Resolve the parse / info-string / status-string entry points by symbol name.
att_parse_data_fn =
reinterpret_cast<ParseFn*>(dlsym(handle, "rocprof_trace_decoder_parse_data"));
att_info_fn = reinterpret_cast<InfoFn*>(dlsym(handle, "rocprof_trace_decoder_get_info_string"));
att_status_fn =
reinterpret_cast<StatusFn*>(dlsym(handle, "rocprof_trace_decoder_get_status_string"));
parse = reinterpret_cast<parse_fn_t*>(dlsym(handle, "rocprof_trace_decoder_parse_data"));
info = reinterpret_cast<info_fn_t*>(dlsym(handle, "rocprof_trace_decoder_get_info_string"));
status =
reinterpret_cast<status_fn_t*>(dlsym(handle, "rocprof_trace_decoder_get_status_string"));

// The version query symbol may be missing from the library, so it is called only if found.
auto* func_version =
reinterpret_cast<version_fn_t*>(dlsym(handle, "rocprof_trace_decoder_get_version"));

// When the symbol is missing, the version_* members keep whatever values they already hold.
if(func_version) func_version(&version_major, &version_minor, &version_patch);
// Only major and minor are combined into `version`; the patch number is not included.
version = TTD_MAKE_VERSION(version_major, version_minor);
};

DL::~DL()
Expand Down
22 changes: 13 additions & 9 deletions source/lib/rocprofiler-sdk/thread_trace/dl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ namespace thread_trace
{
class DL
{
using ParseFn = decltype(rocprof_trace_decoder_parse_data);
using InfoFn = decltype(rocprof_trace_decoder_get_info_string);
using StatusFn = decltype(rocprof_trace_decoder_get_status_string);
using parse_fn_t = decltype(rocprof_trace_decoder_parse_data);
using info_fn_t = decltype(rocprof_trace_decoder_get_info_string);
using status_fn_t = decltype(rocprof_trace_decoder_get_status_string);

public:
DL(const char* libpath);
Expand All @@ -44,14 +44,18 @@ class DL

bool valid() const
{
return handle != nullptr && att_parse_data_fn != nullptr && att_info_fn != nullptr &&
att_status_fn != nullptr;
return handle != nullptr && parse != nullptr && info != nullptr && status != nullptr;
};

ParseFn* att_parse_data_fn = nullptr;
InfoFn* att_info_fn = nullptr;
StatusFn* att_status_fn = nullptr;
void* handle = nullptr;
parse_fn_t* parse = nullptr;
info_fn_t* info = nullptr;
status_fn_t* status = nullptr;
void* handle = nullptr;

uint32_t version_major = 0;
uint32_t version_minor = 0;
uint32_t version_patch = 0;
uint64_t version = 0;
};

} // namespace thread_trace
Expand Down
Loading