Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions common/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,13 @@ enum InstructionType {
// List of instruction groups
enum GroupType {
G_FP64 = 0, // FP64 arithmetic instructions
G_FP32, // FP32 arithmetic instructions
G_FP32, // FP32 arithmetic instructions
/**
* Fernando Fernandes, 10/2022
* Add the FP16 from FP16 MMA fault sites separated from FP32
*/
G_FP16,
G_MMA,
G_LD, // instructions that read from emory
G_PR, // instructions that write to PR registers only
G_NODEST, // instructions with no destination register
Expand All @@ -229,7 +235,13 @@ enum BitFlipModel {
FLIP_TWO_BITS, // flip two adjacent bits
RANDOM_VALUE, // write a random value.
ZERO_VALUE, // write value 0
NUM_BFM_TYPES
/**
* Fernando Fernandes, 10/2022
* Add warp wide fault models
*/
WARP_RANDOM_VALUE, // random in a warp
WARP_ZERO_VALUE, // zero in all the warp
NUM_BFM_TYPES
};


Expand Down
34 changes: 31 additions & 3 deletions common/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,12 @@ const char * instTypeNames[NUM_ISA_INSTRUCTIONS] = {
"GETLMEMBASE", "SETCRSPTR", "SETLMEMBASE" , "PMTRIG", "SETCTAID"
};

/**
* Fernando Fernandes, 10/2022
* Add the FP16 and MMA fault sites separated from FP32
*/
const char * instGrouptNames[NUM_INST_GROUPS] = {
"fp64", "fp32", "ld", "pr", "nodest", "others", "gppr", "gp"
"fp64", "fp32", "fp16", "mma", "ld", "pr", "nodest", "others", "gppr", "gp"
};

int fp64Inst[] = {
Expand All @@ -122,6 +126,18 @@ int fp32Inst[] = {
FADD, FADD32I, FCMP, FFMA, FFMA32I, FMNMX, FMUL, FMUL32I, FSEL, FSET, FSWZADD, IPA, DSET
};

/**
* Fernando Fernandes, 09/2022
* Add the FP16 and MMA fault sites separated from FP32
*/
int fp16Inst[] = {
HADD2, HADD2_32I, HFMA2, HFMA2_32I, HMUL2, HMUL2_32I, HSET2, HSETP2,
};

int MMAInst[] = {
IMMA, HMMA,
};

int ldInst[] = {
LD, LDC, LDG, LDL, LDS, SULD, SUST, TLD, TLD4, TLD4S, TLDS
};
Expand All @@ -142,11 +158,15 @@ int noDestInst[] = {

int otherInst[] = {
// Floating-point Instructions
MUFU, RRO, HADD2, HADD2_32I, HFMA2, HFMA2_32I, HMUL2, HMUL2_32I, HSET2, HSETP2,
/**
* Fernando Fernandes, 09/2022
* Add the FP16 and MMA fault sites separated from FP32
*/
MUFU, RRO, // HADD2, HADD2_32I, HFMA2, HFMA2_32I, HMUL2, HMUL2_32I, HSET2, HSETP2,
// Integer Instructions
IDP, IDP4A, BFE, BFI, BMSK, BREV, FLO, IADD, IADD3, IADD32I, ICMP, IMAD, IMAD32I, IMADSP, IMNMX, IMUL, IMUL32I, ISCADD, ISCADD32I, ISET, LEA, LOP, LOP3, LOP32I, POPC, SHF, SHL, SHR, XMAD,
// MMA instructions
IMMA, HMMA,
// IMMA, HMMA,
// Video Instructions
VABSDIFF, VADD, VMAD, VMNMX, VSET, VSHL, VSHR, VABSDIFF4,
// Conversion Instructions
Expand Down Expand Up @@ -249,6 +269,14 @@ int getOpGroupNum(int opcode) {
return G_FP64;
if (checkOpType(opcode, fp32Inst, sizeof(fp32Inst)/sizeof(int)))
return G_FP32;
/**
* Fernando Fernandes, 09/2022
* Add the FP16 and MMA fault sites separated from FP32
*/
if (checkOpType(opcode, fp16Inst, sizeof(fp16Inst) / sizeof(int)))
return G_FP16;
if (checkOpType(opcode, MMAInst, sizeof(MMAInst) / sizeof(int)))
return G_MMA;
if (checkOpType(opcode, ldInst, sizeof(ldInst)/sizeof(int)))
return G_LD;
if (checkOpType(opcode, prOnlyInst, sizeof(prOnlyInst)/sizeof(int)))
Expand Down
2 changes: 1 addition & 1 deletion injector/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ DUMMY=0

SOURCES=$(wildcard *.cu)
OBJECTS=$(SOURCES:.cu=.o)
ARCH=35
ARCH=70

NVBIT_TOOL=injector.so

Expand Down
32 changes: 26 additions & 6 deletions injector/inject_funcs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,30 @@ __inline__ __device__ int get_flat_tid() {
// Get bit-mask for error injection. Old value will be XORed with this mask later to inject the error.
__inline__
__device__ unsigned int get_mask(uint32_t bitFlipModel, float bitIDSeed, unsigned int oldVal) {
/**
* Fernando Fernandes, 10/2022
* Added the mask creation for warp wide injections
*/
return (bitFlipModel == FLIP_SINGLE_BIT) * ((unsigned int)1<<(int)(32*bitIDSeed)) +
(bitFlipModel == FLIP_TWO_BITS) * ((unsigned int)3<<(int)(31*bitIDSeed)) +
(bitFlipModel == RANDOM_VALUE) * (((unsigned int)-1) * bitIDSeed) +
(bitFlipModel == ZERO_VALUE) * oldVal;
(bitFlipModel == RANDOM_VALUE || bitFlipModel == WARP_RANDOM_VALUE) * (((unsigned int)-1) * bitIDSeed) +
(bitFlipModel == ZERO_VALUE || bitFlipModel == WARP_ZERO_VALUE) * oldVal;
}

extern "C" __device__ __noinline__ void inject_error(uint64_t piinfo, uint64_t pcounters, uint64_t pverbose_device,
int offset, int index, int grp_index, int predicate, int destGPRNum, int regval,
int numDestGPRs, int destPRNum1, int destPRNum2, int maxRegs) {

inj_info_t* inj_info = (inj_info_t*)piinfo;
inj_info_t* inj_info = (inj_info_t*)piinfo;
uint32_t verbose_device = *((uint32_t *)pverbose_device);
uint64_t * counters = (uint64_t *)pcounters;
// Fernando Fernandes, 10/2022: The inj_info is now an array of 32 position of inj_info_t
// each position for each lane_id. This is only valid for warp wide injections, other fault models use position 0
const unsigned lane_id = get_laneid();
const bool warp_wide_fi = inj_info->bitFlipModel == WARP_ZERO_VALUE || inj_info->bitFlipModel == WARP_RANDOM_VALUE;
if (warp_wide_fi) {
inj_info += lane_id;
}

if (verbose_device)
inj_info->debug[NUM_DEBUG_VALS-1] = 1;
Expand Down Expand Up @@ -75,6 +86,12 @@ extern "C" __device__ __noinline__ void inject_error(uint64_t piinfo, uint64_t p

bool injectFlag = false;
switch (igid) {
/**
* Fernando Fernandes, 10/2022
* Enable the FP16 and the MMA injections
*/
case G_FP16: // It is supposed to work for FP16 and FP16MMA, as they have destination registers
case G_MMA:
case G_FP32: // inject into one of the dest reg
case G_FP64: // inject into one of the regs written by the inst
case G_LD: // inject into one of the regs written by the inst
Expand All @@ -96,14 +113,17 @@ extern "C" __device__ __noinline__ void inject_error(uint64_t piinfo, uint64_t p
case G_NODEST: // do nothing
default: break;
}

// Fernando Fernandes, 10/2022:
// If the warp injections is selected, sync between all the active threads within the warp
if (warp_wide_fi)
injectFlag = __any_sync(__activemask(), injectFlag) != 0;
if (verbose_device && injectFlag)
printf("inj_info->instID=%ld, %ld, %ld, %ld\n", inj_info->instID, currCounter1, currCounter2, currCounter3);

if (injectFlag) {
// assert(0 == 10);
if (verbose_device)
printf("offset=0x%x, igid:%d, destGPRNum=%d, grp_index=%d\n", offset, igid, destGPRNum, grp_index);
printf("offset=0x%x, igid:%d, destGPRNum=%d, grp_index=%d laneId=%d\n", offset, igid, destGPRNum, grp_index, lane_id); // Fernando Fernandes, 10/2022: debug message
// We need to randomly select one register from numDestGPRs + (destPRNum1 != -1) + (destPRNum2 != -1)
int totalDest = numDestGPRs + (destPRNum1 != -1) + (destPRNum2 != -1);
assert(totalDest > 0);
Expand Down Expand Up @@ -142,7 +162,7 @@ extern "C" __device__ __noinline__ void inject_error(uint64_t piinfo, uint64_t p
assert(inj_info->debug[12] == inj_info->opcode);
assert(inj_info->debug[13] == inj_info->pcOffset);
if (verbose_device)
printf("done here\n");
printf("done here, lane:%d\n", lane_id); // Fernando Fernandes, 10/2022: debug message
} else {
assert(0 == 2);
}
Expand Down
Loading