-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_workload.py
More file actions
102 lines (74 loc) · 3.23 KB
/
generate_workload.py
File metadata and controls
102 lines (74 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import json
import numpy as np
import pandas as pd
import toml
import umap.umap_ as umap
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from util import transform_geo_to_xyz
# Load the pipeline configuration; config.toml is resolved relative to the
# current working directory, and all input/output paths come from it.
with open("./config.toml") as f:
    config = toml.load(f)
# TODO: Read the cities list from config
input_file = config["general"]["gsts_list"]  # CSV of ground stations
output_file = config["workload"]["output_file"]  # JSON: per-station file popularity order
file_size_output_file = config["workload"]["file_size_output_file"]  # CSV: file_id -> file_size
print(f"Reading input file from {input_file}")
# Columns read later: id, lat, lng, iso2 -- TODO confirm against the CSV schema.
df_gst = pd.read_csv(input_file)
def row_transform_geo_to_xyz(row):
    """Convert one ground-station row's geographic lat/lng into cartesian xyz.

    Thin per-row adapter around util.transform_geo_to_xyz, suitable for
    DataFrame.apply(axis=1).
    """
    return transform_geo_to_xyz(row.lat, row.lng)
# Attach cartesian coordinates to every ground station so 3-D distances can
# be computed below. zip(*...) unpacks the per-row (x, y, z) tuples into
# three column-aligned sequences. (f-prefix dropped: no placeholders, F541.)
print("Transforming ground station lat/lon to xyz coordinates...")
df_gst["x"], df_gst["y"], df_gst["z"] = zip(
    *df_gst.apply(row_transform_geo_to_xyz, axis=1)
)
# Reduce the preprocessed ground-station features to a 3-D embedding, then lay
# a regular grid of synthetic "files" over the embedding's bounding box.
reducer = umap.UMAP(n_components=3, n_neighbors=100)
# Features: numeric xyz coordinates plus the categorical country code.
X_raw = df_gst[["x", "y", "z", "iso2"]].to_numpy()
print("Preprocessing ground stations...")
num_cols = [0, 1, 2]  # x, y, z -> standardized
cat_cols = [3]        # iso2 -> one-hot encoded
pipeline = ColumnTransformer(
    [
        ("num_transformer", StandardScaler(), num_cols),
        ("cat_transformer", OneHotEncoder(), cat_cols),
    ]
)
X = pipeline.fit_transform(X_raw)
print("Translating ground station data into embedding space...")
X_transformed = reducer.fit_transform(X)
# Bounding box of the embedded stations, one min/max per embedding axis.
xmin = np.min(X_transformed[:, 0])
xmax = np.max(X_transformed[:, 0])
ymin = np.min(X_transformed[:, 1])
ymax = np.max(X_transformed[:, 1])
zmin = np.min(X_transformed[:, 2])
zmax = np.max(X_transformed[:, 2])
# np.mgrid treats a complex step N*1j as "N points on this axis" and truncates
# N with int(abs(step)). num_files ** (1/3) is inexact for perfect cubes
# (1000 ** (1/3) == 9.999...), which silently produced 9 points per axis
# (729 files instead of 1000). Rounding the cube root honors num_files.
files_per_dim = round(config["workload"]["num_files"] ** (1. / 3)) * 1j
xx, yy, zz = np.mgrid[xmin:xmax:files_per_dim, ymin:ymax:files_per_dim, zmin:zmax:files_per_dim]
# file_id is index in file_positions; shape is (3, num_files).
file_positions = np.vstack([xx.ravel(), yy.ravel(), zz.ravel()])
def calculate_distance_to_files(gst_coords, files_coords):
    """Euclidean distance from one ground station to every file position.

    Parameters
    ----------
    gst_coords : array-like, shape (3,)
        Ground-station coordinates in the embedding space.
    files_coords : ndarray, shape (3, n_files)
        File positions, one per column (as built by np.vstack above).

    Returns
    -------
    ndarray, shape (n_files,)
        One distance per file, in column order of ``files_coords``.
    """
    # Vectorized replacement for the original per-column Python loop:
    # broadcasting subtracts the station from every column at once, and
    # norm(axis=1) collapses the coordinate axis. Cast to float because
    # pandas row selection can hand us an object-dtype array.
    gst = np.asarray(gst_coords, dtype=float)
    return np.linalg.norm(files_coords.T - gst, axis=1)
print("Computing files popularities for all ground stations...")
# Popularity model: for each ground station, files are ranked by euclidean
# distance in the embedding space -- nearer files are more popular.
gst_file_orders = {}
for _, row in df_gst.iterrows():  # positional index is unused
    gst_id = row.id
    print(f"Computing file popularity for ground station {gst_id}...")
    gst_coords = row[["x", "y", "z"]].to_numpy()
    D = calculate_distance_to_files(gst_coords, file_positions)
    # argsort yields file_ids ordered nearest-to-farthest.
    file_order = D.argsort()
    gst_file_orders[gst_id] = file_order.tolist()
# References for the heavy-tailed file-size model:
# https://www.researchgate.net/publication/227252035_Web_Workload_Characterization_Ten_Years_Later suggest pareto distribution with alpha~1
# https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjhpZj0wMzxAhUhg_0HHXalChQQFjAAegQIAhAD&url=https%3A%2F%2Fwww.cc.gatech.edu%2F~dovrolis%2FCourses%2F8803_F03%2Famogh.ppt&usg=AOvVaw3OA7_OuBWI0B1lMUOV9dVh suggests lognormal model or hybrid model with lognormal distribution with a Pareto tail
# One size per file. file_positions is (3, num_files), so the file count is
# shape[1] -- the original shape[0] generated only 3 sizes regardless of
# num_files. pareto_alpha stays a float: int() would truncate e.g. 1.5 to 1
# and silently fatten the distribution's tail.
file_sizes = np.ceil(
    np.random.pareto(float(config["workload"]["pareto_alpha"]), file_positions.shape[1])
)
df_fs = pd.DataFrame(data=file_sizes, columns=["file_size"])
df_fs.index.name = "file_id"  # file_id matches the column index of file_positions
df_fs.to_csv(file_size_output_file, mode="w", index=True)
print(f"Saving file popularity output file to {output_file}...")
with open(output_file, "w") as f:
    json.dump(gst_file_orders, f)