-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFileMeta.cs
More file actions
310 lines (284 loc) · 13.1 KB
/
FileMeta.cs
File metadata and controls
310 lines (284 loc) · 13.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Diagnostics;
namespace ARI_POC
{
/// <summary>
/// Determines the type of a file from its header.
/// </summary>
/// <remarks>
/// In keeping with the FileMeta Manifesto (see http://www.filemeta.org/manifesto) file types
/// should be determined by their contents, not by their container or by their reference.
/// When necessary, resorts to the filename extension for backward compatibility.
/// </remarks>
//
// So far, supports the following file types with their corresponding headers and extensions:
//
// header | extension | type
// ----------------------------------------
// <%#ejs | .ejs | application/x-embeddedjavascript
// /*javascript | .js | application/javascript
// <?xml | .xml | text/xml
//
// JSON presents a problem (as, likely, will many other formats) because it doesn't define any
// kind of header other than the opening brace. For the moment, I'm leaving it off the list
// because it's not needed in this application. Despite heavy use of JSON, it's never stored
// to disk. However, so long as no other format opens with a single brace with no other distinguishing
// features, that may be sufficient to add JSON.
public enum FileTypeId
{
    /// <summary>The file type could not be determined.</summary>
    unknown = 0,

    /// <summary>Embedded JavaScript (header "&lt;%#ejs", extension ".ejs").</summary>
    ejs = 1,

    /// <summary>JavaScript (header "/*javascript", extension ".js").</summary>
    js = 2,

    /// <summary>XML (header "&lt;?xml", extension ".xml").</summary>
    xml = 3
}
/// <summary>
/// Retrieves file type and metadata information from files. Presently only
/// identifies JavaScript, Embedded JavaScript (EJS), and XML files and only reads metadata
/// from JavaScript and Embedded JavaScript.
/// </summary>
/// <remarks>
/// <para>This would be an amazing class if it worked with more than a limited set
/// of file types. However, it remains useful for the set with which it works
/// and open for extension.
/// </para>
/// <para>File type identification and metadata requires that files meet certain requirements:
/// </para>
/// <para>JavaScript: The file MUST begin with the text "/*javascript" followed by a newline.
/// This file-type declaration is case-sensitive and must be all lower-case. Because "/*" is
/// the beginning of a multiline comment in JavaScript this line and subsequent lines until
/// the comment end will be ignored by other JavaScript.
/// </para>
/// <para>The declaration line (above) SHOULD be followed by one or more lines of metadata
/// in "microYAML" format (see below). The metadata is concluded by a line containing only
/// the close-comment sequence "*/".
/// </para>
/// <para>Embedded JavaScript: The file MUST begin with the text "&lt;%#ejs" followed by a
/// newline. This file-type declaration is case-sensitive and must be all lower-case. The
/// declaration is followed by one or more lines of metadata in "microYAML" format (see
/// below). The metadata is concluded by the close-declaration sequence "%>".
/// </para>
/// <para>microYAML: The metadata uses a simplified subset of YAML inspired by but different
/// from the work at: https://code.google.com/p/mini-yaml-parser/. It is a line-oriented
/// format and follows this syntax:
/// </para>
/// <para>key: literal -- Specifies a key and the literal value that follows.</para>
/// <para>key: | -- Specifies a key followed by a multiline literal.</para>
/// <para> some text -- All text following the multiline literal that's indented more than the key is part of the literal.</para>
/// <para># text -- Lines beginning with '#' are comments and ignored.</para>
/// <para>*/ -- End of document (for JavaScript).</para>
/// <para>%> -- End of document (for EJS).</para>
/// <para> -- Anything else is a syntax error that ends the metadata and is silently ignored.</para>
/// <para>Keys must start with a letter or underscore and be composed of letters, digits, and underscores.</para>
/// <para>microYAML is a proper subset of YAML (except for document beginning and
/// ending syntax). Additional features of YAML could be added in the future.
/// </para>
/// </remarks>
public static class FileMeta
{
// Number of bytes read from the start of a stream when sniffing its type;
// also passed as the buffer size to the readers used for sniffing.
private const int cMaxHeaderBytes = 32;

// Leading text that identifies each supported file type.
private static readonly string sEjs_Header = "<%#ejs";
private static readonly string sJs_Header = "/*javascript";
private static readonly string sXml_Header = "<?xml";

// Backing store for the Strict property; strict mode is on by default.
private static bool sStrict = true;

/// <summary>
/// Gets or sets strict mode. While true, a file's type is taken only from its
/// header; while false, methods that know the filename fall back to the
/// filename extension when the header is not recognized.
/// </summary>
public static bool Strict
{
    get
    {
        return sStrict;
    }
    set
    {
        sStrict = value;
    }
}
/// <summary>
/// Determines the file type from the text found at the very start of a file.
/// </summary>
/// <param name="header">Leading text of the file. It may be longer than the
/// signature; only its prefix is examined.</param>
/// <returns>The matching <see cref="FileTypeId"/>, or <see cref="FileTypeId.unknown"/>
/// when <paramref name="header"/> is null, empty, or does not begin with a known
/// signature.</returns>
public static FileTypeId GetFileTypeFromHeader(string header)
{
    if (string.IsNullOrEmpty(header))
    {
        return FileTypeId.unknown;
    }

    // The signatures are exact, case-sensitive literals, so compare ordinally.
    // The single-argument StartsWith overload is culture-sensitive and can give
    // different answers depending on the current culture.
    //
    // When more types are added this linear scan can be replaced with a hard-coded
    // search. A plain hash lookup will not work because the signatures have
    // different lengths, so the number of characters to hash is not known up front.
    if (header.StartsWith(sEjs_Header, StringComparison.Ordinal)) return FileTypeId.ejs;
    if (header.StartsWith(sJs_Header, StringComparison.Ordinal)) return FileTypeId.js;
    if (header.StartsWith(sXml_Header, StringComparison.Ordinal)) return FileTypeId.xml;
    return FileTypeId.unknown;
}
/// <summary>
/// Infers the file type from a filename's extension (case-insensitive).
/// </summary>
/// <param name="filename">File name or path whose extension is examined.</param>
/// <returns>The <see cref="FileTypeId"/> mapped to the extension, or
/// <see cref="FileTypeId.unknown"/> when the extension is missing or not recognized.</returns>
public static FileTypeId GetFileTypeFromExtension(string filename)
{
    // Path.GetExtension returns the extension INCLUDING the leading period,
    // and returns null when given a null path.
    string extension = Path.GetExtension(filename);
    if (extension == null)
    {
        return FileTypeId.unknown;
    }

    switch (extension.ToLowerInvariant())
    {
        case ".ejs":
            return FileTypeId.ejs;
        case ".js":
            return FileTypeId.js;
        case ".xml": // was "xml" (no dot), which could never match
            return FileTypeId.xml;
        default:
            return FileTypeId.unknown;
    }
}
/// <summary>
/// Determines the file type from raw bytes taken from the start of a file.
/// </summary>
/// <param name="header">Buffer containing the leading bytes of the file.</param>
/// <param name="offset">Index in <paramref name="header"/> of the first byte to examine.</param>
/// <param name="len">Number of bytes to examine, starting at <paramref name="offset"/>.</param>
/// <returns>The detected <see cref="FileTypeId"/>, or <see cref="FileTypeId.unknown"/>.</returns>
public static FileTypeId GetFileType(byte[] header, int offset, int len)
{
    // Honor the caller's offset (previously the window always started at index 0).
    // The bytes are decoded as UTF-8 unless a byte-order mark says otherwise.
    using (StreamReader reader = new StreamReader(new MemoryStream(header, offset, len), Encoding.UTF8, true, cMaxHeaderBytes, false))
    {
        return GetFileTypeFromHeader(reader.ReadToEnd());
    }
}
/// <summary>
/// Determines the file type by peeking at the first bytes of a stream.
/// </summary>
/// <param name="stream">Stream positioned at the start of the content. It must
/// support seeking, because its position is read and then restored.</param>
/// <returns>The detected <see cref="FileTypeId"/>, or <see cref="FileTypeId.unknown"/>.</returns>
public static FileTypeId GetFileType(Stream stream)
{
    long pos = stream.Position;
    byte[] buf = new byte[cMaxHeaderBytes];

    // Stream.Read may return fewer bytes than requested even before the end of
    // the stream, so keep reading until the buffer is full or the stream ends.
    int bufLen = 0;
    while (bufLen < buf.Length)
    {
        int read = stream.Read(buf, bufLen, buf.Length - bufLen);
        if (read <= 0) break;
        bufLen += read;
    }

    // Put the stream back where the caller left it.
    stream.Position = pos;
    return GetFileType(buf, 0, bufLen);
}
/// <summary>
/// Determines the type of the named file from its header and, when strict mode
/// is off and the header is not recognized, from its filename extension.
/// </summary>
/// <param name="filename">Path of the file to inspect.</param>
/// <returns>The detected <see cref="FileTypeId"/>, or <see cref="FileTypeId.unknown"/>.</returns>
public static FileTypeId GetFileType(string filename)
{
    using (FileStream input = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, cMaxHeaderBytes))
    {
        FileTypeId detected = GetFileType(input);

        // The header wins whenever it is recognized; strict mode forbids guessing.
        if (sStrict || detected != FileTypeId.unknown)
        {
            return detected;
        }

        return GetFileTypeFromExtension(filename);
    }
}
/// <summary>
/// Opens the named file for reading as text and reports its file type.
/// </summary>
/// <param name="filename">Path of the file to open.</param>
/// <param name="rFileType">Receives the type detected from the header or, when
/// strict mode is off and the header is not recognized, from the extension.</param>
/// <returns>A reader over the file, positioned at its start. The caller owns the
/// reader and must dispose it.</returns>
public static TextReader GetReaderAndFileType(string filename, out FileTypeId rFileType)
{
    FileStream file = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read);
    bool handedOff = false;
    try
    {
        FileTypeId detected = GetFileType(file);
        if (detected == FileTypeId.unknown && !sStrict)
        {
            detected = GetFileTypeFromExtension(filename);
        }
        rFileType = detected;

        TextReader result = new StreamReader(file, Encoding.UTF8, true);

        // From here on the reader owns the stream, so it must not be closed below.
        handedOff = true;
        return result;
    }
    finally
    {
        // Only reached with handedOff == false when something above threw.
        if (!handedOff)
        {
            file.Dispose();
        }
    }
}
/* This regular expression limits mini-YAML acceptance exclusively to lines that
 * have keys and values that meet YAML "Plain Style". Since this doesn't support
 * character escaping it limits the values that can be expressed. Nevertheless, it's
 * sufficient for current needs. If those needs change, the syntax can be extended
 * to use additional YAML features.
 * This regex assumes that leading and trailing whitespace has already been trimmed.
 * Because of that trimming, a key with an empty value arrives as "key:" with nothing
 * after the colon, so the whitespace-plus-value part is optional. When it is absent
 * the "value" group is unmatched and its Value is the empty string. A non-empty value
 * must still be separated from the colon by at least one space or tab.
 */
private static readonly Regex sYamlPlain = new Regex(@"^(?<key>[a-zA-Z_][a-zA-Z0-9_]*):(?:[ \t]+(?<value>(\w.*)|(\|)|()))?\z");
/// <summary>
/// Strips leading and trailing spaces and tabs from a line and reports how many
/// leading space/tab characters were removed.
/// </summary>
/// <param name="txt">The line to trim. Must not be null.</param>
/// <param name="rIndent">Receives the count of leading space and tab characters
/// (each tab counts as one).</param>
/// <returns>The trimmed text.</returns>
private static string TrimAndCount(string txt, out int rIndent)
{
    // Scan forward past leading blanks.
    int first = 0;
    for (; first < txt.Length; ++first)
    {
        char c = txt[first];
        if (c != ' ' && c != '\t') break;
    }

    // Scan backward past trailing blanks, never crossing 'first'.
    int last = txt.Length;
    for (; last > first; --last)
    {
        char c = txt[last - 1];
        if (c != ' ' && c != '\t') break;
    }

    rIndent = first;

    bool nothingToTrim = (first == 0 && last == txt.Length);
    return nothingToTrim ? txt : txt.Substring(first, last - first);
}
/// <summary>
/// Parses microYAML metadata from a reader into a key/value dictionary.
/// </summary>
/// <remarks>
/// Reading stops at the first line that (after trimming) starts with "*/" or "%>",
/// at the first line that is not valid microYAML, or at the end of the input.
/// Blank lines and lines starting with '#' are skipped. A later occurrence of a
/// key overwrites an earlier one.
/// </remarks>
/// <param name="reader">Reader positioned at the first metadata line.</param>
/// <returns>The parsed entries; empty when no metadata was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public static SortedDictionary<string, string> ParseMicroYaml(TextReader reader)
{
    if (reader == null) throw new ArgumentNullException("reader");

    SortedDictionary<string, string> dict = new SortedDictionary<string, string>();

    // Since YAML is a line-oriented format we just read line-by-line and treat each line accordingly.
    // Read the first line
    int indent;
    string line = reader.ReadLine();
    if (line == null) return dict; // Empty input: nothing to parse
    line = TrimAndCount(line, out indent);

    for (; ; )
    {
        if (line.Length == 0 || line[0] == '#')
        {
            // Blank line or comment: nothing to do
        }
        else if (line.StartsWith("*/", StringComparison.Ordinal) || line.StartsWith("%>", StringComparison.Ordinal))
        {
            break; // End of document
        }
        else
        {
            Match match = sYamlPlain.Match(line);
            if (!match.Success)
            {
                // Format explicitly: Debug.WriteLine(string, string) is the
                // (message, category) overload and would not substitute {0}.
                Debug.WriteLine(string.Format("microYAML syntax error: '{0}'", line));
                break; // Just consider this to be the end of input.
            }

            string key = match.Groups["key"].Value;
            string value = match.Groups["value"].Value;
            if (!value.Equals("|", StringComparison.Ordinal)) // Single-line value
            {
                dict[key] = value;
            }
            else
            {
                // Multiline literal: gather every following line that is blank or
                // indented more deeply than the key line. Each gathered line is
                // stored trimmed and terminated with CRLF.
                StringBuilder literal = new StringBuilder();
                for (; ; )
                {
                    int indent2;
                    line = reader.ReadLine();
                    if (line == null) line = "*/"; // Fake end-of-document
                    line = TrimAndCount(line, out indent2);
                    if (line.Length == 0 || indent2 > indent)
                    {
                        literal.Append(line).Append("\r\n");
                    }
                    else
                    {
                        dict[key] = literal.ToString();
                        indent = indent2;
                        break;
                    }
                }

                // 'line' already holds the first line after the literal, so
                // process it without reading another one.
                continue;
            }
        }

        // Load the next line
        line = reader.ReadLine();
        if (line == null) break; // end of document
        line = TrimAndCount(line, out indent);
    }

    return dict;
}
/// <summary>
/// Gets the file type and reads the metadata from a stream.
/// </summary>
/// <remarks>
/// The type is taken from the stream's header only; the <see cref="Strict"/>
/// setting is not consulted because no filename is available here.
/// </remarks>
/// <param name="stream">Seekable stream positioned at the start of the content.
/// It is left open, and its position is put back to where it was on entry.</param>
/// <param name="rTypeId">Receives the file type detected from the header.</param>
/// <param name="rMetadata">Receives the parsed microYAML metadata for JavaScript
/// and Embedded JavaScript content, or an empty dictionary for any other type.</param>
public static void GetFileTypeAndMetadata(Stream stream, out FileTypeId rTypeId, out SortedDictionary<string, string> rMetadata)
{
    FileTypeId typeId = GetFileType(stream);
    SortedDictionary<string, string> metadata;
    if (typeId == FileTypeId.ejs || typeId == FileTypeId.js)
    {
        long pos = stream.Position;
        try
        {
            // leaveOpen: true - the stream belongs to the caller.
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8, true, 256, true))
            {
                reader.ReadLine(); // Skip the filetype indicator line
                metadata = ParseMicroYaml(reader);
            }
        }
        finally
        {
            // Restore the caller's position even when reading or parsing throws.
            stream.Position = pos;
        }
    }
    else
    {
        metadata = new SortedDictionary<string, string>(); // Empty metadata
    }
    rTypeId = typeId;
    rMetadata = metadata;
}
/// <summary>
/// Gets the file type and reads the metadata from a file.
/// </summary>
/// <remarks>
/// When <see cref="Strict"/> is false and the header is not recognized, the type
/// is inferred from the filename extension, as <see cref="GetFileType(string)"/>
/// and <see cref="GetReaderAndFileType"/> do. Metadata lives in the header block,
/// so a file whose type was inferred from its extension has empty metadata.
/// </remarks>
/// <param name="filename">Path of the file to inspect.</param>
/// <param name="rTypeId">Receives the detected file type.</param>
/// <param name="rMetadata">Receives the parsed microYAML metadata, or an empty
/// dictionary when the file carries none.</param>
public static void GetFileTypeAndMetadata(string filename, out FileTypeId rTypeId, out SortedDictionary<string, string> rMetadata)
{
    using (FileStream stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        GetFileTypeAndMetadata(stream, out rTypeId, out rMetadata);
    }

    // Honor non-strict mode the same way the other filename-based entry points do.
    if (!sStrict && rTypeId == FileTypeId.unknown)
    {
        rTypeId = GetFileTypeFromExtension(filename);
    }
}
}
}