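"""Process raw HamSCI chirp-sounder recordings into ionospheric virtual layer heights.

Each received file is cross-correlated against a reference chirp; the spacing
between the direct-path and echo correlation peaks gives a time delay, which is
converted to a virtual reflection height and appended to daily CSV files.
"""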
import os
import datetime as dt

import numpy as np
import pandas as pd
import scipy.signal as signal

def read_file(file_name, dtype=np.float32):
    """Read interleaved I/Q samples from a raw binary file as one complex array."""
    data = np.fromfile(file_name, dtype=dtype)
    data = data.reshape(-1, 2)  # column 0 = in-phase (I), column 1 = quadrature (Q)
    return data[:, 0] + 1j * data[:, 1]

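# The raw files are assumed to be flat binaries of interleaved float32 I/Q
# pairs; for example, an array of complex samples could be written back out
# in the same layout with (output file name hypothetical):
#
#     iq = read_file('chirps_and_echoes/chirpA')
#     np.stack([iq.real, iq.imag], axis=1).astype(np.float32).tofile('copy.bin')
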
def cross_correlation(signal1, signal2):
    """Cross-correlate two signals; return the lag axis and magnitudes in dB."""
    x = np.asarray(signal2)
    y = np.asarray(signal1)
    correlation = signal.correlate(x, y, mode='full')
    lag = np.arange(len(correlation)) - (len(correlation) // 2)  # centre the lag axis for 'full' output
    magnitudes = 20 * np.log10(np.abs(correlation))  # abs() first: the correlation of complex signals is complex
    return lag, magnitudes

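# Note: for inputs of unequal length the centred lag axis above is off by
# roughly (len(x) - len(y)) / 2 samples; scipy.signal.correlation_lags gives
# the exact axis. Only the *spacing* between peaks is used downstream, so the
# offset cancels either way:
#
#     lag = signal.correlation_lags(len(x), len(y), mode='full')
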
def calculate_delay(x, y, sample_rate):
    """Determine the echo delay from the spacing between correlation peaks."""
    y = np.real(y)
    if len(y) == 0:
        return np.nan, np.nan
    peaks = signal.find_peaks(y, distance=300)[0]  # find peaks in the correlation
    if len(peaks) < 2:  # need both a direct-path peak and an echo peak
        return np.nan, np.nan
    local_peaks = pd.DataFrame({'x': x[peaks], 'y': y[peaks]})  # a data frame keeps lag and magnitude values in sync
    local_peaks = local_peaks.sort_values(by='y', ascending=False)
    # The strongest peak is the direct path.
    directx = local_peaks['x'].values[0]
    directy = local_peaks['y'].values[0]
    # The second-strongest peak is the indirect path (the echo).
    secondx = local_peaks['x'].values[1]
    secondy = local_peaks['y'].values[1]
    delay = (secondx - directx) / sample_rate  # delay in seconds
    # Determine how closely the echo's correlation peak matches the direct path.
    peak1 = np.argwhere(x == directx)[0][0]
    peak2 = np.argwhere(x == secondx)[0][0]
    w1 = -175  # window over which the curves around the two peaks are compared
    w2 = 175
    curve1 = y[peak1 + w1:peak1 + w2] - (directy - secondy)  # shift the direct peak down to the echo's level
    curve2 = y[peak2 + w1:peak2 + w2]
    if len(curve1) == len(curve2):
        diff = curve1 - curve2
        root_sum_squares_diff = np.sqrt(np.sum(diff ** 2))
    else:
        root_sum_squares_diff = np.nan
    return delay, root_sum_squares_diff

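# Worked example: an echo arriving tau = 0.002 s after the direct path maps to
# a virtual height of h = c * tau / 2 = 299792.458 km/s * 0.002 s / 2 ≈ 300 km,
# which is why calculate_virtual_layer_heights below accepts delays only
# between 0.001 s (~150 km) and 0.003 s (~450 km).
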
def calculate_virtual_layer_heights(raw_data_directory, start_date, end_date):
    '''
    :param raw_data_directory: parent directory of the raw data files
    :param start_date: first date to process, as a timezone-aware datetime
    :param end_date: last date to process, as a timezone-aware datetime
    '''
    delta = end_date - start_date
    date_directories = []
    for i in range(delta.days + 1):
        day = raw_data_directory + dt.datetime.strftime(start_date + dt.timedelta(days=i), '%Y/%m/%d/')
        if os.path.isdir(day):
            date_directories.append(day)
    fnames = []
    fpaths = []
    for directory in date_directories:  # list every file in the date range along with its path
        for file_name in os.listdir(directory):
            file_path = directory + file_name
            if file_name == 'no_name' or os.path.getsize(file_path) == 0:  # skip the transmitted chirp and empty files
                continue
            date = dt.datetime.strptime(file_name[0:19], '%Y_%m_%d_%H_%M_%S').replace(tzinfo=dt.timezone.utc)
            if date > start_date:
                fnames.append(file_name)
                fpaths.append(file_path)
    blank = np.full(len(fnames), np.nan)
    correlation_data = pd.DataFrame({'names': fnames,  # a dataframe keeps everything organized while processing
                                     'paths': fpaths,
                                     'utc_times': blank,
                                     'lags': blank,
                                     'mags': blank,
                                     'delays': blank,
                                     'heights': blank,
                                     'root_sum_squares_diff': blank,
                                     'frequencies': blank,
                                     'echo_detected': np.full(len(fnames), False)})
    # lags/mags hold a whole array per row, so those columns need object dtype
    correlation_data['lags'] = correlation_data['lags'].astype(object)
    correlation_data['mags'] = correlation_data['mags'].astype(object)
    correlation_data['utc_times'] = pd.to_datetime(correlation_data['names'].str[0:19], format='%Y_%m_%d_%H_%M_%S')  # parse the UTC timestamp from the file name
    correlation_data['frequencies'] = correlation_data['names'].str[20:]  # parse the frequency from the file name
    sample_chirp = read_file('chirps_and_echoes/chirpA')
    for i in correlation_data.index:  # cross-correlate each received file against the reference chirp
        file_path = correlation_data['paths'][i]
        chirp_echo = read_file(file_path)
        if len(chirp_echo) != 0:
            lag, mag = cross_correlation(sample_chirp, chirp_echo)
            correlation_data.at[i, 'lags'] = lag
            correlation_data.at[i, 'mags'] = mag
        else:
            correlation_data.at[i, 'lags'] = []
            correlation_data.at[i, 'mags'] = []
    sample_rate = 195312
    max_root_sum_squares_diff = 500
    min_delay = 0.001  # 150 km * 2 / 300000 km/s
    max_delay = 0.003  # 450 km * 2 / 300000 km/s
    for i in correlation_data.index:  # calculate the delay for each echo
        delay, root_sum_squares_diff = calculate_delay(correlation_data['lags'][i], correlation_data['mags'][i], sample_rate)
        if np.isnan(root_sum_squares_diff) or root_sum_squares_diff > max_root_sum_squares_diff:  # the return signal is lost in noise, so disregard the data point
            correlation_data.loc[i, 'delays'] = np.nan
            correlation_data.loc[i, 'echo_detected'] = False
        elif delay > max_delay or delay < min_delay:  # a "delay" outside the reasonable range is likely noise or a multi-hop path
            correlation_data.loc[i, 'delays'] = np.nan
            correlation_data.loc[i, 'echo_detected'] = False
        else:
            correlation_data.loc[i, 'delays'] = delay
            correlation_data.loc[i, 'echo_detected'] = True
        correlation_data.loc[i, 'root_sum_squares_diff'] = root_sum_squares_diff
    correlation_data['heights'] = correlation_data['delays'] * (299792.458 / 2)  # convert delay (s) to virtual layer height (km)
    data_output = pd.DataFrame({'utc_times': correlation_data['utc_times'],
                                'frequencies_Hz': correlation_data['frequencies'],
                                'virtual_heights_km': correlation_data['heights'],
                                'echo_detected': correlation_data['echo_detected']})
    return data_output

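# The returned frame has one row per received file: utc_times (datetime),
# frequencies_Hz (parsed from the file name), virtual_heights_km (NaN when no
# echo passed the quality checks), and echo_detected (bool).
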
def update_layer_height_csv(csv_directory, raw_data_directory):
    if len(os.listdir(csv_directory)) != 0:  # resume from the last row of the most recent daily CSV
        old_data = pd.read_csv(csv_directory + sorted(os.listdir(csv_directory))[-1])
        old_data['utc_times'] = pd.to_datetime(old_data['utc_times'], format='ISO8601', utc=True)
        start_date = (old_data['utc_times'].iloc[-1]).to_pydatetime()
    else:
        start_date = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=3)
    end_date = dt.datetime.now(dt.timezone.utc) + dt.timedelta(days=1)
    data = calculate_virtual_layer_heights(raw_data_directory, start_date, end_date)
    save_height_data(data, csv_directory)

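# Because update_layer_height_csv resumes from the last saved timestamp, it can
# be re-run periodically (e.g. from a scheduled job, an assumed deployment) and
# only raw files newer than the existing CSV data are processed.
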
def save_height_data(data, data_directory):
    if len(data) == 0:  # nothing to save
        return
    start_date = data['utc_times'].iloc[0].date()
    end_date = data['utc_times'].iloc[-1].date()
    day = start_date
    while day <= end_date:  # write one CSV per day, merging with any existing file
        new_data = data[data['utc_times'].dt.date == day]
        file_path = data_directory + str(day) + '.csv'
        if os.path.isfile(file_path):
            old_data = pd.read_csv(file_path)
            old_data['utc_times'] = pd.to_datetime(old_data['utc_times'], format='ISO8601')
            new_data = pd.concat([old_data, new_data])
        new_data = new_data.sort_values(by=['utc_times'], ascending=True)
        new_data = new_data.reset_index(drop=True)
        new_data = new_data.drop_duplicates()
        new_data.to_csv(file_path, index=False)
        day = day + dt.timedelta(days=1)

if __name__ == '__main__':
    start_date = dt.datetime(2026, 2, 6, tzinfo=dt.timezone.utc)
    end_date = dt.datetime(2026, 2, 9, tzinfo=dt.timezone.utc)
    # print(calculate_virtual_layer_heights('chirps_and_echoes/', start_date, end_date))
    csv_directory = 'Scranton_data/'
    raw_data_directory = 'chirps_and_echoes/'
    update_layer_height_csv(csv_directory, raw_data_directory)