# utils.py
import numpy as np
from keras.utils.np_utils import to_categorical
from pcaputilities import extractSequences


def pad_seq(seq, max_length, PAD=0):
    """
    Pad a sequence with PAD tokens up to max_length.

    :param seq: list of int, token ids
    :param max_length: int, target length
    :param PAD: int, id of the padding token
    :return: list of int, the padded sequence (also mutated in place)
    """
    seq += [PAD for i in range(max_length - len(seq))]
    return seq
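
# A quick illustration of pad_seq (hypothetical values, shown for clarity only):
#   pad_seq([5, 6], 4)         -> [5, 6, 0, 0]
#   pad_seq([5, 6], 4, PAD=9)  -> [5, 6, 9, 9]
# Note that the `+=` mutates the list that was passed in as well as returning it.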


def extract_all(real_packet_sizes_file):
    """
    Extract packet size sequences from a file of signed ints.
    The sign indicates packet direction.

    # Arguments:
        real_packet_sizes_file: String
            path to file
    # Returns:
        normalized_packets: 2D list of non-negative ints (token ids >= 3)
        V: vocab size, including the PAD, BOS and EOS tokens
    """
    real_packets = extractSequences(real_packet_sizes_file)
    normalized_packets = []
    max_packet_size = 0
    for packets in real_packets:
        print(packets)
        max_packet_size = max(max([abs(int(x)) for x in packets]), max_packet_size)
    # Shift signed sizes from [-max, max] to [3, 2 * max + 3] so that
    # ids 0, 1 and 2 stay reserved for the PAD, BOS and EOS tokens.
    V = (max_packet_size * 2) + 3
    for packets in real_packets:
        packet_sizes = [(int(x) + max_packet_size + 3) for x in packets]
        normalized_packets.append(packet_sizes)
    # The largest assigned id is 2 * max_packet_size + 3, so the vocab size is V + 1.
    return normalized_packets, V + 1
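
# Worked example of the mapping above (hypothetical values): if the largest
# absolute packet size seen is 5, a sequence [-3, 0, 5] becomes [5, 8, 13]
# (each size shifted by max_packet_size + 3), ids 0/1/2 remain free for
# PAD/BOS/EOS, and the returned vocab size is 2 * 5 + 3 + 1 = 14.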


def split_sequences(sequences, max_length):
    """
    Split each sequence into consecutive chunks of at most max_length tokens.
    """
    chunks = []
    for sequence in sequences:
        chunks = chunks + [sequence[i:i + max_length] for i in range(0, len(sequence), max_length)]
    return chunks
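
# Example (hypothetical values): split_sequences([[1, 2, 3, 4, 5], [6, 7]], 2)
# returns [[1, 2], [3, 4], [5], [6, 7]]; trailing chunks may be shorter than
# max_length.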


def build_generator_pretraining_datasets(sequences, V, PAD=0, BOS=1, EOS=2):
    """
    Format a generator pretraining data batch.

    # Arguments:
        sequences: (B, None)
            real sequences with variable length
        V: int, vocab size
    # Returns:
        x: numpy.array, shape = (B, max_length)
            input sequences, each prefixed with BOS and suffixed with EOS.
        y_true: numpy.array, shape = (B, max_length, V)
            target labels with one-hot encoding (x shifted left by one step).
        max_length: int, the max length of a sequence in the batch.
            Sequences shorter than max_length are padded with PAD.
    """
    max_length = 0
    x, y_true = [], []
    for sequence in sequences:
        ids_x, ids_y_true = [], []
        ids_x.append(BOS)
        ids_x.extend(sequence)
        ids_x.append(EOS)
        x.append(ids_x)
        ids_y_true.extend(sequence)
        ids_y_true.append(EOS)
        y_true.append(ids_y_true)
        max_length = max(max_length, len(ids_x))
    for i, ids in enumerate(x):
        x[i] = x[i][:max_length]
    for i, ids in enumerate(y_true):
        y_true[i] = y_true[i][:max_length]
    x = [pad_seq(seq, max_length, PAD) for seq in x]
    x = np.array(x, dtype=np.int32)
    y_true = [pad_seq(seq, max_length, PAD) for seq in y_true]
    y_true = np.array(y_true, dtype=np.int32)
    y_true = to_categorical(y_true, num_classes=V)
    return x, y_true, max_length
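
# Shape check with toy ids (illustrative only): for sequences = [[5, 6, 7], [8, 9]]
# and V = 14, x becomes [[1, 5, 6, 7, 2], [1, 8, 9, 2, 0]] (BOS/EOS added, PAD
# appended), y_true is the one-hot encoding of [[5, 6, 7, 2, 0], [8, 9, 2, 0, 0]]
# with shape (2, 5, 14), and max_length is 5.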


def build_discriminator_datasets(real_sequences, fake_sequences, PAD=0, BOS=1, EOS=2):
    """
    Format a discriminator data batch.

    # Arguments:
        real_sequences: (B, None)
            real sequences with variable length
        fake_sequences: (B, None)
            generated sequences with variable length
    # Returns:
        X: numpy.array, shape = (B_real + B_fake, max_length)
            real and fake sequences, each prefixed with BOS and suffixed with EOS,
            padded with PAD up to max_length.
        Y: numpy.array, shape = (B_real + B_fake,)
            labels: 1 for real sequences, 0 for fake sequences.
        max_length: int, the max length of a sequence in the batch.
    """
    max_length = 0
    X, Y = [], []
    for real_sequence in real_sequences:
        x = [BOS]
        x.extend(real_sequence)
        x.append(EOS)
        X.append(x)  # ex. [BOS, 8, 10, 6, 3, EOS]
        Y.append(1)
        max_length = max(max_length, len(x))
    for fake_sequence in fake_sequences:
        x = [BOS]
        x.extend(fake_sequence)
        x.append(EOS)
        X.append(x)  # ex. [BOS, 8, 10, 6, 3, EOS]
        Y.append(0)
        max_length = max(max_length, len(x))  # also account for fake sequence lengths
    for i, ids in enumerate(X):
        X[i] = X[i][:max_length]
    X = [pad_seq(sen, max_length, PAD) for sen in X]
    X = np.array(X, dtype=np.int32)
    return X, np.array(Y, dtype=np.int32), max_length
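

# Minimal smoke test (a sketch, not part of the original module): the toy ids
# below are made up for illustration; running this still requires the keras and
# pcaputilities imports at the top of the file to resolve.
if __name__ == "__main__":
    real = [[5, 6, 7], [8, 9]]
    fake = [[3, 4], [10, 11, 12]]

    X, Y, max_len = build_discriminator_datasets(real, fake)
    print("discriminator X shape:", X.shape)  # (4, 5): 2 real + 2 fake rows
    print("discriminator labels:", Y)         # [1 1 0 0]

    x, y_true, max_len = build_generator_pretraining_datasets(real, V=14)
    print("pretraining x shape:", x.shape)            # (2, 5)
    print("pretraining y_true shape:", y_true.shape)  # (2, 5, 14)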