-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_log.py
More file actions
161 lines (137 loc) · 5.38 KB
/
parse_log.py
File metadata and controls
161 lines (137 loc) · 5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import re
import csv
import time
import shutil
# Extract "name@k: value" pairs from a single metrics line.
def parse_metrics(line):
    """Parse a metric string such as 'map@5: 0.123 recall@10: 0.456'
    into a dict mapping 'name@k' -> float value."""
    return {
        name: float(score)
        for name, score in re.findall(r'(\w+@\d+):\s*([\d\.]+)', line)
    }
# Parse a single log file.
def parse_log_file(file_path):
    """Extract the Valid/Test metric lines following the last BEST marker.

    Returns {'valid': {...}, 'test': {...}} on success, or None when the
    log is incomplete (no BEST marker, or Valid/Test lines missing after it).
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Scan backwards for the last BEST marker.
    best_line = next(
        (i for i in range(len(lines) - 1, -1, -1)
         if '█████████████ BEST ████████████████' in lines[i]),
        -1,
    )
    if best_line == -1:
        return None  # No BEST flag, skip

    # Keep the LAST Valid/Test lines found at or after the marker.
    valid_line = None
    test_line = None
    for candidate in lines[best_line:]:
        if 'Valid:' in candidate:
            valid_line = candidate
        if 'Test:' in candidate:
            test_line = candidate
    if not valid_line or not test_line:
        return None  # Missing valid/test line

    return {
        'valid': parse_metrics(valid_line),
        'test': parse_metrics(test_line),
    }
def move_incomplete_logs(log_dir, temp_dir='temp_log'):
    """
    Check and move incomplete log files to the temp_log folder.

    A .log file is considered an incomplete training run when it has no
    BEST flag and its last modification time is more than 30 minutes ago
    (recently modified files may belong to a still-running job and are
    left alone).

    Returns the list of file names that were moved.
    """
    # Ensure the temp_log folder exists (no-op when it already does).
    os.makedirs(temp_dir, exist_ok=True)
    current_time = time.time()
    moved_files = []
    for filename in os.listdir(log_dir):
        if not filename.endswith('.log'):
            continue
        file_path = os.path.join(log_dir, filename)
        # Skip files modified within the last 30 minutes (1800 seconds):
        # their training run may still be writing to them.
        if current_time - os.path.getmtime(file_path) <= 1800:
            continue
        # Check for the BEST flag that marks a completed run.
        try:
            with open(file_path, 'r') as f:
                has_best = '█████████████ BEST ████████████████' in f.read()
        except Exception as e:
            # BUG FIX: message previously printed the literal "(unknown)"
            # instead of naming the file that failed.
            print(f'[ERROR] Failed to read {filename}: {e}')
            continue
        # If there is no BEST flag, move it to temp_log.
        if not has_best:
            temp_file_path = os.path.join(temp_dir, filename)
            try:
                shutil.move(file_path, temp_file_path)
                moved_files.append(filename)
                print(f'[MOVED] {filename} -> {temp_dir}/ (incomplete training)')
            except Exception as e:
                print(f'[ERROR] Failed to move {filename}: {e}')
    return moved_files
# Parse all log files.
def parse_all_logs(log_dir):
    """Parse every completed .log file in log_dir.

    Returns a list of (model_name, parsed) tuples, where model_name is the
    file name with its '.log' suffix removed. Logs that parse_log_file
    reports as incomplete are skipped with a warning.
    """
    results = []
    for filename in os.listdir(log_dir):
        if filename.endswith('.log'):
            file_path = os.path.join(log_dir, filename)
            parsed = parse_log_file(file_path)
            if parsed:
                # BUG FIX: strip only the trailing '.log' suffix;
                # replace() would also remove '.log' mid-name.
                results.append((filename[:-len('.log')], parsed))
            else:
                # BUG FIX: message previously printed the literal
                # "(unknown)" instead of the skipped file's name.
                print(f'[WARN] Skipped (incomplete): {filename}')
    return results
# Order metric names for stable CSV columns.
def sort_metrics(metrics):
    """
    Group metrics by type and sort by the number after @
    e.g., map@5, map@10, recall@5, recall@10
    """
    def ordering(metric_name):
        parsed = re.match(r'(\D+?)@(\d+)', metric_name)
        if parsed is None:
            # Names without an '@k' suffix sort by the raw string.
            return (metric_name, 0)
        base, cutoff = parsed.groups()
        return (base, int(cutoff))

    return sorted(metrics, key=ordering)
def save_to_csv(results, output_file='log_summary.csv'):
    """Write parsed results to a CSV summary, one row per model.

    Columns: 'model', then valid_<metric> and test_<metric> for every
    metric that appears in any result, ordered by sort_metrics. Metrics
    missing from a particular model are left blank.
    """
    # Collect every metric that occurs in any result.
    all_metrics = set()
    for _, result in results:
        all_metrics.update(result['valid'])
        all_metrics.update(result['test'])
    sorted_metrics = sort_metrics(all_metrics)
    # Build headers.
    headers = (['model']
               + [f'valid_{m}' for m in sorted_metrics]
               + [f'test_{m}' for m in sorted_metrics])
    # BUG FIX: sort a copy — the original mutated the caller's list
    # in place via results.sort().
    ordered = sorted(results, key=lambda item: item[0].lower())
    # Write to CSV (explicit encoding so metric names round-trip
    # regardless of platform default).
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        for model_name, result in ordered:
            row = ([model_name]
                   + [result['valid'].get(m, '') for m in sorted_metrics]
                   + [result['test'].get(m, '') for m in sorted_metrics])
            writer.writerow(row)
# Script entry point: quarantine dead runs, then summarize the rest.
if __name__ == '__main__':
    log_dir = './log'  # Your log directory path

    # Step 1: move logs from runs that died before finishing.
    print("Checking for incomplete log files...")
    moved_files = move_incomplete_logs(log_dir)
    if not moved_files:
        print("No incomplete log files found.")
    else:
        print(f"Moved {len(moved_files)} incomplete log files to temp_log/")

    # Step 2: parse the remaining logs and write the CSV summary.
    print("\nParsing remaining log files...")
    results = parse_all_logs(log_dir)
    save_to_csv(results)
    print(f'Done. Parsed {len(results)} logs.')