-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSinesFileSanitizer.cpp
More file actions
166 lines (137 loc) · 4.92 KB
/
SinesFileSanitizer.cpp
File metadata and controls
166 lines (137 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#include "SinesFileSanitizer.h"
#include <mutex>
#include <codecvt>
#include <locale>
#include <regex>
#include <algorithm>
#include <unordered_map>
// Out-of-class definition of the static lookup table. It is default-constructed
// (empty) here and filled by initSpecialCharMap(); sanitizeFileName() replaces
// every code point found in it with an underscore.
std::unordered_map<char32_t, std::string> SinesFileSanitizer::specialCharMap;
// Registers every code point that has to be replaced by an underscore in
// specialCharMap: ASCII whitespace first, then punctuation, operators,
// mathematical/currency symbols and assorted marks. Each entry maps to "_".
//
// NOTE(review): '.' is part of this table, so dots are replaced even though
// sanitizeFileName() exempts '.' in its fallback branch - confirm which of the
// two is intended.
void SinesFileSanitizer::initSpecialCharMap() {
    // All replaced code points as a single UTF-32 literal; entries are
    // inserted in the order in which they appear here.
    const char32_t replacedCodePoints[] =
        U" \t\n\r"       // whitespace
        U"@#£&+()/*"     // basic punctuation and operators
        U"\"':;!?~`|"
        U"•√π÷×§∆€¥"     // mathematical and currency symbols
        U"$¢^°={}[]"
        U"✓™®©%,.<>";    // special marks

    // Stop at the literal's terminating NUL so U'\0' itself is never added.
    for (const char32_t* cursor = replacedCodePoints; *cursor != U'\0'; ++cursor) {
        specialCharMap[*cursor] = "_";
    }
}
// Decodes a UTF-8 byte string into UTF-32 code points.
//
// Returns an empty string if the conversion throws (for example on malformed
// input), so callers cannot distinguish a failure from an empty input.
std::u32string SinesFileSanitizer::utf8to32(const std::string& utf8str) {
    using Utf8Codec = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

    std::u32string decoded;
    try {
        Utf8Codec codec;
        decoded = codec.from_bytes(utf8str);
    } catch (const std::exception&) {
        // Conversion failed: report it as an empty result.
        decoded.clear();
    }
    return decoded;
}
// Encodes a sequence of UTF-32 code points as a UTF-8 byte string.
//
// Returns an empty string if the conversion throws, so callers cannot
// distinguish a failure from an empty input.
std::string SinesFileSanitizer::utf32to8(const std::u32string& utf32str) {
    using Utf8Codec = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

    std::string encoded;
    try {
        Utf8Codec codec;
        encoded = codec.to_bytes(utf32str);
    } catch (const std::exception&) {
        // Conversion failed: report it as an empty result.
        encoded.clear();
    }
    return encoded;
}
// Normalises the underscores of an already-sanitized name.
//
// Runs of underscores are collapsed to a single one, then underscores at the
// start and end are stripped. An empty result becomes "unnamed_file"; a result
// that begins with '-' gets a single '_' put in front of it.
std::string SinesFileSanitizer::cleanupFileName(const std::string& fileName) {
    const std::regex underscoreRun("_+");
    const std::regex edgeUnderscores("^_+|_+$");

    const std::string collapsed = std::regex_replace(fileName, underscoreRun, "_");
    std::string trimmed = std::regex_replace(collapsed, edgeUnderscores, "");

    // Nothing usable is left: fall back to the default name.
    if (trimmed.empty()) {
        return "unnamed_file";
    }

    // Keep the name from starting with a hyphen.
    if (trimmed.front() == '-') {
        trimmed.insert(0, 1, '_');
    }
    return trimmed;
}
// Returns a copy of str with its single-byte code points lowercased.
//
// Only code points up to 0xFF are passed to std::tolower (which follows the
// current C locale); every larger code point is copied through unchanged.
// Casting a larger code point to unsigned char would keep just its low byte
// and overwrite the character with an unrelated one (U+0141 would turn into
// 'a'), so that path is deliberately avoided.
std::u32string SinesFileSanitizer::toLowercase(const std::u32string& str) {
    std::u32string lowercase = str;
    std::transform(lowercase.begin(), lowercase.end(), lowercase.begin(),
                   [](char32_t c) -> char32_t {
                       if (c > 0xFF) {
                           return c;  // not representable as unsigned char
                       }
                       return static_cast<char32_t>(
                           std::tolower(static_cast<unsigned char>(c)));
                   });
    return lowercase;
}
// Produces a sanitized file name from a UTF-8 encoded input.
//
// The input is decoded to UTF-32, lowercased with toLowercase(), and every
// code point that is listed in specialCharMap - or that is neither
// alphanumeric nor one of '_', '.', '-' - is replaced by '_'. The result is
// re-encoded as UTF-8 and passed through cleanupFileName().
//
// Returns "unnamed_file" when fileName is empty or when utf8to32() yields an
// empty string (which it also does on a decoding failure).
std::string SinesFileSanitizer::sanitizeFileName(const std::string& fileName) {
    // Fill the shared lookup table exactly once.
    static std::once_flag flag;
    std::call_once(flag, []() {
        initSpecialCharMap();
    });

    if (fileName.empty()) {
        return "unnamed_file";
    }

    // Work on UTF-32 so that each element is one whole code point.
    std::u32string sanitized = utf8to32(fileName);
    if (sanitized.empty()) {
        return "unnamed_file";
    }

    sanitized = toLowercase(sanitized);

    for (char32_t& codePoint : sanitized) {
        if (specialCharMap.find(codePoint) != specialCharMap.end()) {
            codePoint = U'_';
            continue;
        }

        // std::isalnum only accepts values representable as unsigned char.
        // Casting a larger code point would classify it by its low byte alone
        // (U+0141 would be judged as 'A'), so anything above 0xFF is treated
        // as non-alphanumeric.
        const bool isAlnum =
            codePoint <= 0xFF &&
            std::isalnum(static_cast<unsigned char>(codePoint)) != 0;

        if (!isAlnum && codePoint != U'_' && codePoint != U'.' && codePoint != U'-') {
            codePoint = U'_';
        }
    }

    // Back to UTF-8, then tidy up the underscores.
    return cleanupFileName(utf32to8(sanitized));
}
// Sanitizes fileName like the single-argument overload and then limits the
// result to at most maxLength bytes.
//
// The cut position is moved backwards while the byte at that position is a
// UTF-8 continuation byte (10xxxxxx), so a multi-byte sequence is never split.
// The shortened name is passed through cleanupFileName() again. A maxLength of
// 0 yields "unnamed_file" straight away.
std::string SinesFileSanitizer::sanitizeFileName(const std::string& fileName, size_t maxLength) {
    if (maxLength == 0) {
        return "unnamed_file";
    }

    std::string name = sanitizeFileName(fileName);
    if (name.length() <= maxLength) {
        return name;  // already short enough
    }

    // name[cut] is always in range here because cut <= maxLength < length.
    size_t cut = maxLength;
    while (cut > 0 && (static_cast<unsigned char>(name[cut]) >> 6) == 0x2) {
        --cut;
    }

    name.erase(cut);
    return cleanupFileName(name);
}
// Reports whether fileName (UTF-8) consists only of allowed characters.
//
// A name is rejected when it is empty, when utf8to32() yields an empty string
// for it, when its first character is '.' or '-', or when it contains any
// code point that is neither alphanumeric nor one of '_', '.', '-'.
bool SinesFileSanitizer::isValidFileName(const std::string& fileName) {
    if (fileName.empty()) {
        return false;
    }

    const std::u32string utf32Name = utf8to32(fileName);
    if (utf32Name.empty()) {
        return false;
    }

    // A leading dot or hyphen is not allowed.
    if (utf32Name.front() == U'.' || utf32Name.front() == U'-') {
        return false;
    }

    return std::all_of(utf32Name.begin(), utf32Name.end(), [](char32_t c) {
        if (c == U'_' || c == U'.' || c == U'-') {
            return true;
        }
        // std::isalnum only accepts values representable as unsigned char.
        // Casting a larger code point would classify it by its low byte alone
        // (U+0141 would be accepted as 'A'), so anything above 0xFF is
        // rejected.
        return c <= 0xFF && std::isalnum(static_cast<unsigned char>(c)) != 0;
    });
}