Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 8 additions & 73 deletions ml_analysis/NB.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
File: NB.py
Project: analysis
Last Modified: 2021-8-2
Created Date: 2021-8-2
Last Modified: 2022-2-7
Created Date: 2022-2-7
Copyright (c) 2021
Author: AHMA project (Univ Rennes, CNRS, Inria, IRISA)
"""
Expand All @@ -23,76 +23,7 @@
sys.path.append(os.path.join (os.path.dirname (__file__), "../pre-processings/"))

from nicv import compute_nicv
from list_manipulation import get_tag

################################################################################
def load_traces (files_list, bandwidth, time_limit):
    """Load every trace listed in 'files_list' into one flattened 2D array.

    Each file is loaded with np.load (pickled); its last element is expected
    to be a 2D array.  Only the rows selected by 'bandwidth' and the first
    'time_limit' fraction of the time samples are kept, then flattened.

    input:
      + files_list: list of the filenames
      + bandwidth: indexes of the bandwidth (row selection)
      + time_limit: fraction of the trace to conserve (0..1]
    output:
      + traces: array (D x Q) -- D features, Q = len (files_list)
    """
    ## probe the first file to fix the feature dimension
    first = np.load (files_list [0], allow_pickle = True)[-1][bandwidth, :]

    ## keep only the first time_limit fraction of the time samples
    D = int (first.shape [1]*time_limit)
    first = first [:, :D].flatten ()

    traces = np.zeros ((first.size, len (files_list)))
    traces [:, 0] = first

    ## remaining files: same row/column selection, flattened per column
    for col, fname in enumerate (files_list [1:], start = 1):
        traces [:, col] = np.load (fname, allow_pickle = True)[-1][bandwidth, :D].flatten ()

    return traces

################################################################################
def mean_by_label (traces, labels, files, mean_size):
    """Average the traces by batches of 'mean_size' within each executable.

    Traces are grouped by the tag extracted from their filename (one tag per
    executable, not merely per label), then averaged batch by batch.

    input:
      + traces: array of traces (D x Q)
      + labels: labels of the Q traces
      + files: filenames, used to recover the executable tag
      + mean_size: number of traces per average
    output:
      + averaged traces (D x Q') and the Q' corresponding labels (list)
    """
    tags = np.array ([get_tag (f) for f in files])
    executables = np.unique (tags)

    averaged = []
    new_labels = []

    for i in tqdm (range (len (executables)), desc = 'meaning (%s)'%mean_size, leave = False):
        positions = np.where (tags == str (executables [i]))[0]

        ## NOTE(review): the bound 'len (positions) - mean_size' drops the
        ## final batch even when it is complete -- looks like an off-by-one;
        ## kept as-is to preserve behavior, confirm intent with the authors
        start = 0
        while start < len (positions) - mean_size:
            new_labels.append (labels [positions [start]])

            ## sequential accumulation (kept to match the original rounding)
            acc = 0.
            for offset in range (mean_size):
                acc += traces [:, positions [start + offset]]
            averaged.append (acc/mean_size)

            start += mean_size

    return np.array (averaged).T, new_labels
from evaluation import get_tag, load_traces

################################################################################
def evaluate (path_lists, log_file, model_lda, model_nb, mean_sizes, nb_of_bd,
Expand Down Expand Up @@ -162,6 +93,9 @@ def evaluate (path_lists, log_file, model_lda, model_nb, mean_sizes, nb_of_bd,

## testing
testing_labels = y_test

## get tags to be able to mean
testing_tags = np.array ([get_tag (f) for f in x_test_filelist])

## load NB
gnb = joblib.load (model_nb)
Expand All @@ -187,7 +121,8 @@ def evaluate (path_lists, log_file, model_lda, model_nb, mean_sizes, nb_of_bd,
file_log.write ('compute with %s per mean\n'%mean_size)
file_log.close ()

X, y = mean_by_label (testing_traces, np.array (testing_labels), x_test_filelist, mean_size)
X, y = mean_by_tags (testing_traces, testing_tags,
np.array (testing_labels), x_test_filelist, mean_size)

# NB + LDA
t0 = time.time ()
Expand Down
80 changes: 8 additions & 72 deletions ml_analysis/SVM.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
File: SVM.py
Project: analysis
Last Modified: 2021-8-2
Created Date: 2021-8-2
Last Modified: 2022-2-7
Created Date: 2022-2-7
Copyright (c) 2021
Author: AHMA project (Univ Rennes, CNRS, Inria, IRISA)
"""
Expand All @@ -23,75 +23,7 @@
sys.path.append(os.path.join (os.path.dirname (__file__), "../pre-processings/"))
from nicv import compute_nicv
from list_manipulation import get_tag

################################################################################
def load_traces (files_list, bandwidth, time_limit):
    """Load all traces of 'files_list' as columns of a single 2D array.

    For each file, np.load (pickled) is called and the last element -- a 2D
    array -- is reduced to the 'bandwidth' rows and to the leading
    'time_limit' fraction of its time samples, then flattened into a column.

    input:
      + files_list: list of the filenames
      + bandwidth: indexes of the bandwidth (row selection)
      + time_limit: fraction of the trace to conserve (0..1]
    output:
      + traces: array (D x Q) -- D features, Q = len (files_list)
    """
    ## the first file determines how many time samples are kept
    probe = np.load (files_list [0], allow_pickle = True)[-1][bandwidth, :]
    keep = int (probe.shape [1]*time_limit)

    first_col = probe [:, :keep].flatten ()

    traces = np.zeros ((first_col.size, len (files_list)))
    traces [:, 0] = first_col

    col = 1
    while col < len (files_list):
        loaded = np.load (files_list [col], allow_pickle = True)[-1]
        traces [:, col] = loaded [bandwidth, :keep].flatten ()
        col += 1

    return traces

################################################################################
def mean_by_label (traces, labels, files, mean_size):
    """Average traces per executable, in consecutive batches of 'mean_size'.

    The batching key is the tag derived from each filename (identifying the
    executable), so traces of the same executable -- not only the same
    label -- are averaged together.

    input:
      + traces: array of traces (D x Q)
      + labels: labels of the Q traces
      + files: filenames, used to recover the executable tag
      + mean_size: number of traces per average
    output:
      + averaged traces (D x Q') and the Q' corresponding labels (list)
    """
    tags = np.array ([get_tag (f) for f in files])
    unique_tags = np.unique (tags)

    mean_traces = []
    mean_labels = []

    for t in tqdm (range (len (unique_tags)), desc = 'meaning (%s)'%mean_size, leave = False):
        where = np.where (tags == str (unique_tags [t]))[0]

        ## NOTE(review): 'len (where) - mean_size' as an exclusive bound
        ## skips the last batch even when complete; preserved as-is, verify
        for first in range (0, len (where) - mean_size, mean_size):
            mean_labels.append (labels [where [first]])

            ## accumulate column by column to keep the original rounding
            total = 0.
            for off in range (mean_size):
                total += traces [:, where [first + off]]
            mean_traces.append (total/mean_size)

    return np.array (mean_traces).T, mean_labels
from evaluation import mean_by_tags, load_traces

################################################################################
def evaluate (path_lists, log_file, model_lda, model_svm, mean_sizes, nb_of_bd,
Expand Down Expand Up @@ -161,6 +93,9 @@ def evaluate (path_lists, log_file, model_lda, model_svm, mean_sizes, nb_of_bd,
## testing
testing_labels = y_test

## get tags to be able to mean
testing_tags = np.array ([get_tag (f) for f in x_test_filelist])

## load SVM
svm = joblib.load (model_svm)

Expand All @@ -185,7 +120,8 @@ def evaluate (path_lists, log_file, model_lda, model_svm, mean_sizes, nb_of_bd,
file_log.write ('compute with %s per mean\n'%mean_size)
file_log.close ()

X, y = mean_by_label (testing_traces, np.array (testing_labels), x_test_filelist, mean_size)
X, y = mean_by_tags (testing_traces, testing_tags,
np.array (testing_labels), x_test_filelist, mean_size)

# SVM
t0 = time.time ()
Expand Down
37 changes: 25 additions & 12 deletions ml_analysis/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
File: evaluate.py
Project: analysis
Last Modified: 2021-8-2
Created Date: 2021-8-2
Last Modified: 2022-2-7
Created Date: 2022-2-7
Copyright (c) 2021
Author: AHMA project (Univ Rennes, CNRS, Inria, IRISA)
"""
Expand Down Expand Up @@ -77,37 +77,46 @@ def load_traces (files_list, bandwidth, time_limit):
return traces

################################################################################
def mean_by_label (traces, labels, mean_size):
def mean_by_tags (traces, tags, labels, mean_size):
################################################################################
# mean_by_label
# mean_by_tags
# mean traces per label: the input traces are averaged in batches of
# 'mean_size' traces sharing the same label
#
# input:
# + traces: array of traces (DxQ)
# + tags: unique Id per {malware} x {baits} (from the name of the file)
# + labels: labels used for the classification
# + mean_size: nbr of samples per mean
#
# output:
# + traces: mean traces (dimension: DxQ, D number of features,
# Q number of samples)
# + labels: labels of the averaged traces
################################################################################

unique = np.unique (labels)

unique_tags = np.unique (tags)
tags_to_label = {}
for i in unique_tags: # conversion from tag to label
tags_to_label [i] = labels [np.where (tags == i)[0][0]]

tmp_res = []
tmp_labels = []
count = 0

for i in tqdm (range (len (unique)), desc = 'meaning (%s)'%mean_size):
idx = np.where (labels == unique [i])[0]
for i in range (len (unique_tags)):
idx = np.where (tags == unique_tags [i])[0]

for j in range (0, len (idx) - mean_size, mean_size):
tmp_labels.append (unique [i])
tmp_labels.append (tags_to_label [unique_tags [i]])
current_res = 0.

for k in range (mean_size):
current_res += traces [:, idx [j + k]]

tmp_res.append (current_res/mean_size)

return np.array (tmp_res).T, tmp_labels
return np.array (tmp_res, dtype = tmp_res [0][0].dtype).T, np.array (tmp_labels)

################################################################################
def evaluate (path_lists, log_file, mean_sizes, nb_of_bd, path_acc,
Expand Down Expand Up @@ -219,6 +228,9 @@ def evaluate (path_lists, log_file, mean_sizes, nb_of_bd, path_acc,
testing_traces = load_traces (x_test_filelist, bandwidth, time_limit)
testing_labels = y_test

## get tags to be able to mean
testing_tags = np.array ([get_tag (f) for f in x_test_filelist])

## no means
## projection LDA
t0 = time.time ()
Expand Down Expand Up @@ -257,7 +269,8 @@ def evaluate (path_lists, log_file, mean_sizes, nb_of_bd, path_acc,
file_log.write ('compute with %s per mean\n'%mean_size)
file_log.close ()

X, y = mean_by_label (testing_traces, np.array (testing_labels), mean_size)
X, y = mean_by_tags (testing_traces, testing_tags,
np.array (testing_labels), x_test_filelist, mean_size)

## LDA on means
t0 = time.time ()
Expand Down
Loading