-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataframe_comparison.py
More file actions
80 lines (60 loc) · 2.74 KB
/
dataframe_comparison.py
File metadata and controls
80 lines (60 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from itertools import combinations
# --- Configuration ---
# Root folder scanned by the script entry point; each subdirectory is treated
# as one signal group holding .parquet files and their same-named .mp4 videos.
DATA_DIR = 'data'
# Folder that receives the similarity CSV produced by the script entry point.
OUTPUT_DIR = 'comparativos_output'
# Runs at import time; exist_ok=True makes repeated runs harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def get_all_2d_columns(df: pd.DataFrame) -> list[str]:
    """Return the labels of the 2D coordinate columns of *df*.

    A column qualifies when its label is a string ending in ``_x`` or ``_y``.
    Non-string labels (for example integer column labels) are skipped instead
    of raising ``AttributeError``. The frame's column order is preserved.

    Args:
        df: Frame whose column labels are inspected; its data is not read.

    Returns:
        The matching column labels, in the order they appear in ``df``.
    """
    return [
        col
        for col in df.columns
        # Only strings have .endswith; anything else cannot be a coordinate column.
        if isinstance(col, str) and col.endswith(('_x', '_y'))
    ]
def get_dtw_path(df1: pd.DataFrame, df2: pd.DataFrame, cols: list[str]):
    """Align two frames with fastdtw over the given columns.

    Gaps in each frame are filled forward, then backward, and any value still
    missing afterwards is replaced by 0 before the alignment runs.

    Args:
        df1: First frame; must contain every column in ``cols``.
        df2: Second frame; must contain every column in ``cols``.
        cols: Columns used as the per-row feature vector.

    Returns:
        A ``(path, distance)`` tuple: the warping path and the accumulated
        cost reported by ``fastdtw`` using Euclidean point distance.
    """
    prepared = []
    for frame in (df1, df2):
        # Same cleaning for both inputs: ffill -> bfill -> 0 for what remains.
        gap_free = frame[cols].ffill().bfill().fillna(0)
        prepared.append(gap_free.values)

    first_series, second_series = prepared
    total_cost, warping_path = fastdtw(first_series, second_series, dist=euclidean)
    # fastdtw yields (distance, path); callers expect the reverse order.
    return warping_path, total_cost
def compare_pair(file1_info, file2_info):
    """Compare two recordings with DTW over the 2D columns they share.

    Args:
        file1_info: Mapping with at least ``'name'`` (label used in messages
            and in the pair name) and ``'parquet'`` (path of the parquet file).
        file2_info: Same structure, for the second recording.

    Returns:
        A dict with ``pair`` (``"<name1>_vs_<name2>"``), ``distance`` (total
        DTW cost) and ``avg_distance`` (cost divided by the warping-path
        length), or ``None`` when the pair is skipped because the frames share
        no ``_x``/``_y`` columns or one of them has no rows.
    """
    df1 = pd.read_parquet(file1_info['parquet'])
    df2 = pd.read_parquet(file2_info['parquet'])

    # Sorted so both frames feed their features to DTW in the same order.
    common_cols = sorted(
        set(get_all_2d_columns(df1)) & set(get_all_2d_columns(df2))
    )
    if not common_cols:
        print(f"⚠️ Pular: Nenhuma coluna em comum entre {file1_info['name']} e {file2_info['name']}")
        return None

    # A frame without rows gives DTW nothing to align and would leave no
    # warping path to average over, so skip it like the no-common-columns case.
    if len(df1) == 0 or len(df2) == 0:
        print(f"⚠️ Pular: Sem linhas para comparar entre {file1_info['name']} e {file2_info['name']}")
        return None

    # Calculate DTW using only the shared points
    path, dist = get_dtw_path(df1, df2, common_cols)
    # Normalise by path length so recordings of different durations compare fairly.
    avg_dist = dist / len(path)

    pair_name = f"{file1_info['name']}_vs_{file2_info['name']}"
    # Each point contributes an _x and an _y column, hence the // 2.
    print(f"Processando: {pair_name} | Pontos usados: {len(common_cols)//2} | Distância: {avg_dist:.4f}")
    return {"pair": pair_name, "distance": dist, "avg_distance": avg_dist}
if __name__ == "__main__":
    # Discover every recording that has both a .parquet file and a same-named
    # .mp4 video inside a signal-group subdirectory of DATA_DIR.
    signals_map = []
    # Sorted listings keep pair names and CSV content reproducible across runs;
    # os.listdir itself returns entries in arbitrary order.
    for signal_name in sorted(os.listdir(DATA_DIR)):
        signal_path = os.path.join(DATA_DIR, signal_name)
        if not os.path.isdir(signal_path):
            continue

        files = sorted(os.listdir(signal_path))
        available = set(files)
        for p in files:
            if not p.endswith('.parquet'):
                continue
            # Strip only the trailing extension; str.replace would also eat
            # '.parquet' occurring earlier in the file name.
            base_name = p.removesuffix('.parquet')
            video_file = base_name + '.mp4'
            if video_file not in available:
                continue
            signals_map.append({
                'signal_group': signal_name,
                'name': base_name,
                'video': os.path.join(signal_path, video_file),
                'parquet': os.path.join(signal_path, p),
            })

    all_pairs = list(combinations(signals_map, 2))
    print(f"Total de sinais encontrados: {len(signals_map)}")
    print(f"Total de comparações a realizar: {len(all_pairs)}")

    results = []
    for f1, f2 in all_pairs:
        res = compare_pair(f1, f2)
        # compare_pair returns None for skipped pairs; keeping those would
        # break or corrupt the DataFrame built below.
        if res is not None:
            results.append(res)

    # Explicit columns keep the CSV header stable even when nothing was compared.
    df_results = pd.DataFrame(results, columns=['pair', 'distance', 'avg_distance'])
    # Most similar pairs (smallest average distance) first, so the file is an
    # actual ranking; a stable sort preserves discovery order among ties.
    df_results = df_results.sort_values('avg_distance', kind='stable')
    df_results.to_csv(os.path.join(OUTPUT_DIR, 'ranking_similaridade.csv'), index=False)
    print("\nRanking de similaridade salvo em 'ranking_similaridade.csv'")