Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 8 additions & 73 deletions ml_analysis/NB.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
File: NB.py
Project: analysis
Last Modified: 2021-8-2
Created Date: 2021-8-2
Last Modified: 2022-2-7
Created Date: 2022-2-7
Copyright (c) 2021
Author: AHMA project (Univ Rennes, CNRS, Inria, IRISA)
"""
Expand All @@ -23,76 +23,7 @@
sys.path.append(os.path.join (os.path.dirname (__file__), "../pre-processings/"))

from nicv import compute_nicv
from list_manipulation import get_tag

################################################################################
def load_traces (files_list, bandwidth, time_limit):
    """Load every trace listed in 'files_list' into one flattened 2D array.

    Each file is loaded with np.load (pickled); its last element is expected
    to be a 2D array.  Only the rows selected by 'bandwidth' and the first
    'time_limit' fraction of the time samples are kept, then flattened.

    input:
      + files_list: list of the filenames
      + bandwidth: indexes of the bandwidth (row selection)
      + time_limit: fraction of the trace to conserve (0..1]
    output:
      + traces: array (D x Q) -- D features, Q = len (files_list)
    """
    ## probe the first file to fix the feature dimension
    first = np.load (files_list [0], allow_pickle = True)[-1][bandwidth, :]

    ## keep only the first time_limit fraction of the time samples
    D = int (first.shape [1]*time_limit)
    first = first [:, :D].flatten ()

    traces = np.zeros ((first.size, len (files_list)))
    traces [:, 0] = first

    ## remaining files: same row/column selection, flattened per column
    for col, fname in enumerate (files_list [1:], start = 1):
        traces [:, col] = np.load (fname, allow_pickle = True)[-1][bandwidth, :D].flatten ()

    return traces

################################################################################
def mean_by_label (traces, labels, files, mean_size):
    """Average the traces by batches of 'mean_size' within each executable.

    Traces are grouped by the tag extracted from their filename (one tag per
    executable, not merely per label), then averaged batch by batch.

    input:
      + traces: array of traces (D x Q)
      + labels: labels of the Q traces
      + files: filenames, used to recover the executable tag
      + mean_size: number of traces per average
    output:
      + averaged traces (D x Q') and the Q' corresponding labels (list)
    """
    tags = np.array ([get_tag (f) for f in files])
    executables = np.unique (tags)

    averaged = []
    new_labels = []

    for i in tqdm (range (len (executables)), desc = 'meaning (%s)'%mean_size, leave = False):
        positions = np.where (tags == str (executables [i]))[0]

        ## NOTE(review): the bound 'len (positions) - mean_size' drops the
        ## final batch even when it is complete -- looks like an off-by-one;
        ## kept as-is to preserve behavior, confirm intent with the authors
        start = 0
        while start < len (positions) - mean_size:
            new_labels.append (labels [positions [start]])

            ## sequential accumulation (kept to match the original rounding)
            acc = 0.
            for offset in range (mean_size):
                acc += traces [:, positions [start + offset]]
            averaged.append (acc/mean_size)

            start += mean_size

    return np.array (averaged).T, new_labels
from evaluation import get_tag, load_traces

################################################################################
def evaluate (path_lists, log_file, model_lda, model_nb, mean_sizes, nb_of_bd,
Expand Down Expand Up @@ -162,6 +93,9 @@ def evaluate (path_lists, log_file, model_lda, model_nb, mean_sizes, nb_of_bd,

## testing
testing_labels = y_test

## get tags to be able to mean
testing_tags = np.array ([get_tag (f) for f in x_test_filelist])

## load NB
gnb = joblib.load (model_nb)
Expand All @@ -187,7 +121,8 @@ def evaluate (path_lists, log_file, model_lda, model_nb, mean_sizes, nb_of_bd,
file_log.write ('compute with %s per mean\n'%mean_size)
file_log.close ()

X, y = mean_by_label (testing_traces, np.array (testing_labels), x_test_filelist, mean_size)
X, y = mean_by_tags (testing_traces, testing_tags,
np.array (testing_labels), x_test_filelist, mean_size)

# NB + LDA
t0 = time.time ()
Expand Down
80 changes: 8 additions & 72 deletions ml_analysis/SVM.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
File: SVM.py
Project: analysis
Last Modified: 2021-8-2
Created Date: 2021-8-2
Last Modified: 2022-2-7
Created Date: 2022-2-7
Copyright (c) 2021
Author: AHMA project (Univ Rennes, CNRS, Inria, IRISA)
"""
Expand All @@ -23,75 +23,7 @@
sys.path.append(os.path.join (os.path.dirname (__file__), "../pre-processings/"))
from nicv import compute_nicv
from list_manipulation import get_tag

################################################################################
def load_traces (files_list, bandwidth, time_limit):
    """Load all traces of 'files_list' as columns of a single 2D array.

    For each file, np.load (pickled) is called and the last element -- a 2D
    array -- is reduced to the 'bandwidth' rows and to the leading
    'time_limit' fraction of its time samples, then flattened into a column.

    input:
      + files_list: list of the filenames
      + bandwidth: indexes of the bandwidth (row selection)
      + time_limit: fraction of the trace to conserve (0..1]
    output:
      + traces: array (D x Q) -- D features, Q = len (files_list)
    """
    ## the first file determines how many time samples are kept
    probe = np.load (files_list [0], allow_pickle = True)[-1][bandwidth, :]
    keep = int (probe.shape [1]*time_limit)

    first_col = probe [:, :keep].flatten ()

    traces = np.zeros ((first_col.size, len (files_list)))
    traces [:, 0] = first_col

    col = 1
    while col < len (files_list):
        loaded = np.load (files_list [col], allow_pickle = True)[-1]
        traces [:, col] = loaded [bandwidth, :keep].flatten ()
        col += 1

    return traces

################################################################################
def mean_by_label (traces, labels, files, mean_size):
    """Average traces per executable, in consecutive batches of 'mean_size'.

    The batching key is the tag derived from each filename (identifying the
    executable), so traces of the same executable -- not only the same
    label -- are averaged together.

    input:
      + traces: array of traces (D x Q)
      + labels: labels of the Q traces
      + files: filenames, used to recover the executable tag
      + mean_size: number of traces per average
    output:
      + averaged traces (D x Q') and the Q' corresponding labels (list)
    """
    tags = np.array ([get_tag (f) for f in files])
    unique_tags = np.unique (tags)

    mean_traces = []
    mean_labels = []

    for t in tqdm (range (len (unique_tags)), desc = 'meaning (%s)'%mean_size, leave = False):
        where = np.where (tags == str (unique_tags [t]))[0]

        ## NOTE(review): 'len (where) - mean_size' as an exclusive bound
        ## skips the last batch even when complete; preserved as-is, verify
        for first in range (0, len (where) - mean_size, mean_size):
            mean_labels.append (labels [where [first]])

            ## accumulate column by column to keep the original rounding
            total = 0.
            for off in range (mean_size):
                total += traces [:, where [first + off]]
            mean_traces.append (total/mean_size)

    return np.array (mean_traces).T, mean_labels
from evaluation import mean_by_tags, load_traces

################################################################################
def evaluate (path_lists, log_file, model_lda, model_svm, mean_sizes, nb_of_bd,
Expand Down Expand Up @@ -161,6 +93,9 @@ def evaluate (path_lists, log_file, model_lda, model_svm, mean_sizes, nb_of_bd,
## testing
testing_labels = y_test

## get tags to be able to mean
testing_tags = np.array ([get_tag (f) for f in x_test_filelist])

## load SVM
svm = joblib.load (model_svm)

Expand All @@ -185,7 +120,8 @@ def evaluate (path_lists, log_file, model_lda, model_svm, mean_sizes, nb_of_bd,
file_log.write ('compute with %s per mean\n'%mean_size)
file_log.close ()

X, y = mean_by_label (testing_traces, np.array (testing_labels), x_test_filelist, mean_size)
X, y = mean_by_tags (testing_traces, testing_tags,
np.array (testing_labels), x_test_filelist, mean_size)

# SVM
t0 = time.time ()
Expand Down
37 changes: 25 additions & 12 deletions ml_analysis/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
File: evaluate.py
Project: analysis
Last Modified: 2021-8-2
Created Date: 2021-8-2
Last Modified: 2022-2-7
Created Date: 2022-2-7
Copyright (c) 2021
Author: AHMA project (Univ Rennes, CNRS, Inria, IRISA)
"""
Expand Down Expand Up @@ -77,37 +77,46 @@ def load_traces (files_list, bandwidth, time_limit):
return traces

################################################################################
def mean_by_label (traces, labels, mean_size):
def mean_by_tags (traces, tags, labels, mean_size):
################################################################################
# mean_by_label
# mean_by_tags
# mean traces per label: the input traces are averaged in batches of
# 'mean_size' traces sharing the same label
#
# input:
# + traces: array of traces (DxQ)
# + tags: unique Id per {malware} x {baits} (from the name of the file)
# + labels: labels used for the classification
# + mean_size: nbr of samples per mean
#
# output:
# + traces: mean traces (dimension: DxQ, D number of features,
# Q number of samples)
# + labels: labels of the averaged traces
################################################################################

unique = np.unique (labels)

unique_tags = np.unique (tags)
tags_to_label = {}
for i in unique_tags: # conversion from tag to label
tags_to_label [i] = labels [np.where (tags == i)[0][0]]

tmp_res = []
tmp_labels = []
count = 0

for i in tqdm (range (len (unique)), desc = 'meaning (%s)'%mean_size):
idx = np.where (labels == unique [i])[0]
for i in range (len (unique_tags)):
idx = np.where (tags == unique_tags [i])[0]

for j in range (0, len (idx) - mean_size, mean_size):
tmp_labels.append (unique [i])
tmp_labels.append (tags_to_label [unique_tags [i]])
current_res = 0.

for k in range (mean_size):
current_res += traces [:, idx [j + k]]

tmp_res.append (current_res/mean_size)

return np.array (tmp_res).T, tmp_labels
return np.array (tmp_res, dtype = tmp_res [0][0].dtype).T, np.array (tmp_labels)

################################################################################
def evaluate (path_lists, log_file, mean_sizes, nb_of_bd, path_acc,
Expand Down Expand Up @@ -219,6 +228,9 @@ def evaluate (path_lists, log_file, mean_sizes, nb_of_bd, path_acc,
testing_traces = load_traces (x_test_filelist, bandwidth, time_limit)
testing_labels = y_test

## get tags to be able to mean
testing_tags = np.array ([get_tag (f) for f in x_test_filelist])

## no means
## projection LDA
t0 = time.time ()
Expand Down Expand Up @@ -257,7 +269,8 @@ def evaluate (path_lists, log_file, mean_sizes, nb_of_bd, path_acc,
file_log.write ('compute with %s per mean\n'%mean_size)
file_log.close ()

X, y = mean_by_label (testing_traces, np.array (testing_labels), mean_size)
X, y = mean_by_tags (testing_traces, testing_tags,
np.array (testing_labels), x_test_filelist, mean_size)

## LDA on means
t0 = time.time ()
Expand Down
Loading