-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
68 lines (53 loc) · 2.65 KB
/
data.py
File metadata and controls
68 lines (53 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
import pandas as pd
# Synthetic router-telemetry dataset generator.
#
# Draws independent router metrics, derives a failure probability from them,
# samples a binary `target`, shifts the metrics of the positive rows upward,
# and writes the resulting table to a CSV file.

# Fixed seed so repeated runs produce the same dataset.
np.random.seed(42)

# Number of rows to generate.
n_rows = 10000

# Minimum share of rows that must end up with target == 1.
min_positive_share = 0.05

# Destination of the generated dataset (also reported in the final message).
output_path = 'datos_routers.csv'

# 1. Generate base variables (drawn independently of one another).
active_sessions = np.random.randint(100, 5000, n_rows)  # integers in [100, 5000)
crc_errors = np.random.exponential(scale=0.01, size=n_rows)  # mostly near 0
crc_errors = np.clip(crc_errors, 0, 0.1)  # cap at 0.1
buffer_utilization = np.random.uniform(0, 100, n_rows)
previous_unplanned_restarts = np.random.poisson(lam=0.5, size=n_rows)  # mostly 0, some 1, few >= 2
previous_unplanned_restarts = np.clip(previous_unplanned_restarts, 0, 5)
dropped_packets = np.random.negative_binomial(2, 0.05, size=n_rows)  # long right tail
dropped_packets = np.clip(dropped_packets, 0, 1000)

# 2. Failure score: a weighted sum of the metrics, each scaled to [0, 1] by
# its cap (dropped packets on a log scale). The weights add up to 1, so the
# score itself lies in [0, 1]. active_sessions does not enter the score.
prob_raw = (0.3 * (crc_errors / 0.1) +
            0.25 * (buffer_utilization / 100) +
            0.2 * (previous_unplanned_restarts / 5) +
            0.25 * (np.log1p(dropped_packets) / np.log1p(1000)))

# Add Gaussian noise (std 0.1) and clamp the result to [0, 0.95].
prob = np.clip(prob_raw + np.random.normal(0, 0.1, n_rows), 0, 0.95)

# Halve the probability so that failures are the minority class.
# NOTE(review): an earlier comment aimed for ~8% positives, but nothing here
# calibrates to that figure; a back-of-envelope estimate of the mean score
# suggests the realised rate is closer to 15% - TODO confirm against the
# printed target distribution.
prob = prob * 0.5
# After halving, prob is at most 0.475, so this 0.7 ceiling is only a safety
# bound in case the scaling factor above is changed.
prob = np.clip(prob, 0, 0.7)

# 3. Sample the target: one Bernoulli draw per row.
target = np.random.binomial(1, prob)

# Guarantee the minimum number of positives. The threshold is rounded UP so
# the positive share cannot end below the minimum when
# min_positive_share * n_rows is not a whole number.
min_positives = int(np.ceil(min_positive_share * n_rows))
n_positives = int(target.sum())
if n_positives < min_positives:
    missing = min_positives - n_positives
    # Flip a random subset of the negative rows to positive.
    zero_indices = np.where(target == 0)[0]
    np.random.shuffle(zero_indices)
    target[zero_indices[:missing]] = 1

# 4. Shift the metrics of the positive rows upward (each capped at its
# maximum) to improve separability for supervised learning.
# Deliberately a per-row loop: the three draws are interleaved row by row, and
# vectorising them would assign the random stream differently and change the
# seeded output.
for i in np.where(target == 1)[0]:
    crc_errors[i] = min(0.1, crc_errors[i] + np.random.uniform(0.01, 0.05))
    buffer_utilization[i] = min(100, buffer_utilization[i] + np.random.uniform(5, 15))
    dropped_packets[i] = min(1000, dropped_packets[i] + np.random.randint(10, 100))

# 5. Assemble the DataFrame with English column names.
df = pd.DataFrame({
    'active_sessions': active_sessions.astype(int),
    'crc_errors_per_second': np.round(crc_errors, 6),
    'buffer_memory_utilization_percent': np.round(buffer_utilization, 1),
    'unplanned_restarts_last_24h': previous_unplanned_restarts.astype(int),
    'dropped_packets_due_to_buffer_full_last_hour': dropped_packets.astype(int),
    'target': target
})

# 6. Save to CSV and report the path that was actually written.
df.to_csv(output_path, index=False)
print(f"File '{output_path}' generated with {len(df)} rows.")
print(f"Target distribution:\n{df['target'].value_counts(normalize=True)}")