Skip to content

Commit ade26c2

Browse files
nickpnickp
authored and committed
more
1 parent 3faf70a commit ade26c2

168 files changed

Lines changed: 12703 additions & 14 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.eleventy.js

Lines changed: 153 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,122 @@ module.exports = function(eleventyConfig) {
2525
return dedupeMappings[entityType]?.[entityName] || entityName;
2626
}
2727

28+
/**
 * Normalize a document type into a key suitable for grouping.
 *
 * The value is stringified, lower-cased and trimmed, and every run of
 * whitespace is collapsed to a single space so that variants such as
 * "Court  Filing" and "court filing" end up under the same key.
 *
 * @param {*} docType - Raw document type; any value is accepted.
 * @returns {string|null} The grouping key, or null when the input is
 *   falsy or contains only whitespace.
 */
function normalizeDocType(docType) {
  if (!docType) return null;

  const key = String(docType).toLowerCase().trim().replace(/\s+/g, ' ');

  // A blank string carries no type information; report it like a missing value.
  return key === '' ? null : key;
}
33+
34+
/**
 * Format a document type for display in title case.
 *
 * The value is stringified, lower-cased and split on any whitespace
 * (spaces, tabs, newlines), then each word gets an upper-case first
 * letter and the words are joined with single spaces.
 *
 * @param {*} docType - Raw or normalized document type.
 * @returns {string} The title-cased label, or 'Unknown' when the input
 *   is falsy or contains only whitespace.
 */
function formatDocType(docType) {
  if (!docType) return 'Unknown';

  // Splitting on /\s+/ (rather than ' ') avoids empty "words" from repeated
  // spaces; filter(Boolean) drops the single '' produced by a blank string.
  const words = String(docType)
    .toLowerCase()
    .trim()
    .split(/\s+/)
    .filter(Boolean);

  if (words.length === 0) return 'Unknown';

  return words
    .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
    .join(' ');
}
44+
45+
/**
 * Normalize a date string to a consistent, sortable form.
 *
 * Recognized inputs and their results:
 *   - "YYYY-MM-DD"                          -> returned unchanged
 *   - "YYYY"                                -> "YYYY-00-00" (year-only marker)
 *   - "February 15, 2005", "Feb. 15th 2005" -> "2005-02-15"
 *   - "15 February 2005", "15th Feb 2005"   -> "2005-02-15"
 *   - "2005/02/15", "2005.2.15", "2005-2-5" -> zero-padded "YYYY-MM-DD"
 *   - "02/15/2005" (month first)            -> "2005-02-15"
 *   - "15/02/2005" (first part > 12)        -> read as day first, "2005-02-15"
 *
 * Numeric months must be 1-12 and days 1-31; anything that fails those
 * checks, or that matches no pattern, is returned as the trimmed input.
 *
 * @param {*} dateStr - Raw date value; stringified before parsing.
 * @returns {string|null} The normalized date, the trimmed original when it
 *   cannot be parsed, or null when the input is falsy or blank.
 */
function normalizeDate(dateStr) {
  if (!dateStr) return null;

  const str = String(dateStr).trim();
  if (str === '') return null;

  // Already in ISO format (YYYY-MM-DD)
  if (/^\d{4}-\d{2}-\d{2}$/.test(str)) {
    return str;
  }

  // Just a year (YYYY)
  if (/^\d{4}$/.test(str)) {
    return `${str}-00-00`;
  }

  // A Map (not a plain object) so that words such as "constructor" or
  // "toString" cannot resolve to inherited Object.prototype members.
  const months = new Map([
    ['jan', '01'], ['january', '01'],
    ['feb', '02'], ['february', '02'],
    ['mar', '03'], ['march', '03'],
    ['apr', '04'], ['april', '04'],
    ['may', '05'],
    ['jun', '06'], ['june', '06'],
    ['jul', '07'], ['july', '07'],
    ['aug', '08'], ['august', '08'],
    ['sep', '09'], ['sept', '09'], ['september', '09'],
    ['oct', '10'], ['october', '10'],
    ['nov', '11'], ['november', '11'],
    ['dec', '12'], ['december', '12'],
  ]);

  const pad = (value) => String(value).padStart(2, '0');
  const isMonth = (value) => value >= 1 && value <= 12;
  const isDay = (value) => value >= 1 && value <= 31;

  // "February 15, 2005", "Feb 15, 2005", "Feb. 15th 2005"
  const monthFirst = str.match(/^([A-Za-z]+)\.?\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})$/i);
  if (monthFirst) {
    const month = months.get(monthFirst[1].toLowerCase());
    const day = Number(monthFirst[2]);
    if (month && isDay(day)) {
      return `${monthFirst[3]}-${month}-${pad(day)}`;
    }
  }

  // "15 February 2005", "15 Feb 2005", "15th Feb. 2005"
  const dayFirst = str.match(/^(\d{1,2})(?:st|nd|rd|th)?\s+([A-Za-z]+)\.?,?\s+(\d{4})$/i);
  if (dayFirst) {
    const month = months.get(dayFirst[2].toLowerCase());
    const day = Number(dayFirst[1]);
    if (month && isDay(day)) {
      return `${dayFirst[3]}-${month}-${pad(day)}`;
    }
  }

  // "2005/02/15", "2005.02.15" or unpadded "2005-2-5"
  const yearFirst = str.match(/^(\d{4})[\/.\-](\d{1,2})[\/.\-](\d{1,2})$/);
  if (yearFirst) {
    const month = Number(yearFirst[2]);
    const day = Number(yearFirst[3]);
    if (isMonth(month) && isDay(day)) {
      return `${yearFirst[1]}-${pad(month)}-${pad(day)}`;
    }
  }

  // "02/15/2005", "02.15.2005", "02-15-2005" (US month-first by default)
  const yearLast = str.match(/^(\d{1,2})[\/.\-](\d{1,2})[\/.\-](\d{4})$/);
  if (yearLast) {
    const first = Number(yearLast[1]);
    const second = Number(yearLast[2]);
    if (isMonth(first) && isDay(second)) {
      return `${yearLast[3]}-${pad(first)}-${pad(second)}`;
    }
    // The first part cannot be a month (e.g. "15/02/2005"): read it day-first
    // instead of emitting an impossible month such as "2005-15-02".
    if (isMonth(second) && isDay(first)) {
      return `${yearLast[3]}-${pad(second)}-${pad(first)}`;
    }
  }

  // Couldn't parse - return original
  return str;
}
116+
117+
/**
 * Format a normalized date for display.
 *
 *   - "YYYY-00-00" (year only)         -> "YYYY"
 *   - "YYYY-MM-00" (day not specified) -> "Month YYYY"
 *   - "YYYY-MM-DD"                     -> "Month D, YYYY"
 *   - anything else                    -> the value, stringified, unchanged
 *
 * @param {*} normalizedDate - A date string, normally in YYYY-MM-DD form.
 * @returns {string} The display string, or 'Unknown Date' for falsy input.
 */
function formatDate(normalizedDate) {
  if (!normalizedDate) return 'Unknown Date';

  // Coerce first so a truthy non-string (e.g. the number 2005) cannot throw
  // on the string methods used below.
  const value = String(normalizedDate);

  // Year only (YYYY-00-00)
  if (value.endsWith('-00-00')) {
    return value.substring(0, 4);
  }

  // Full date (YYYY-MM-DD)
  const match = value.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (match) {
    const monthNames = [
      'January', 'February', 'March', 'April', 'May', 'June',
      'July', 'August', 'September', 'October', 'November', 'December',
    ];
    const [, year, monthPart, dayPart] = match;
    const month = Number.parseInt(monthPart, 10);
    const day = Number.parseInt(dayPart, 10);

    // Months outside 1-12 index past the array and fall through to the fallback.
    const monthName = monthNames[month - 1];
    if (monthName) {
      // A "00" day means the day is not specified; avoid printing "February 0, 2005".
      return day === 0 ? `${monthName} ${year}` : `${monthName} ${day}, ${year}`;
    }
  }

  // Fallback
  return value;
}
143+
28144
// Cache the documents data - only compute once
29145
let cachedDocuments = null;
30146

@@ -143,17 +259,31 @@ module.exports = function(eleventyConfig) {
143259
people: [...new Set(Array.from(allEntities.people).map(p => applyDedupe('people', p)))],
144260
organizations: [...new Set(Array.from(allEntities.organizations).map(o => applyDedupe('organizations', o)))],
145261
locations: [...new Set(Array.from(allEntities.locations).map(l => applyDedupe('locations', l)))],
146-
dates: Array.from(allEntities.dates),
262+
dates: [...new Set(Array.from(allEntities.dates).map(d => {
263+
const normalized = normalizeDate(d);
264+
return normalized ? formatDate(normalized) : d;
265+
}))],
147266
reference_numbers: Array.from(allEntities.reference_numbers)
148267
};
149268

269+
// Normalize document metadata
270+
const normalizedMetadata = {
271+
...firstPage.document_metadata,
272+
document_type: firstPage.document_metadata?.document_type
273+
? formatDocType(firstPage.document_metadata.document_type)
274+
: null,
275+
date: firstPage.document_metadata?.date
276+
? formatDate(normalizeDate(firstPage.document_metadata.date))
277+
: firstPage.document_metadata?.date
278+
};
279+
150280
return {
151281
unique_id: normalizedDocNum, // Normalized version for unique URLs
152282
document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Show original if consistent, else normalized
153283
raw_document_numbers: rawDocNums, // All variations found
154284
pages: docPages,
155285
page_count: docPages.length,
156-
document_metadata: firstPage.document_metadata,
286+
document_metadata: normalizedMetadata,
157287
entities: deduplicatedEntities,
158288
full_text: docPages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'),
159289
folder: folders.join(', '), // Show all folders if document spans multiple
@@ -223,19 +353,25 @@ module.exports = function(eleventyConfig) {
223353
});
224354
}
225355

226-
// Dates
356+
// Dates (normalize for grouping)
227357
if (doc.entities?.dates) {
228358
doc.entities.dates.forEach(date => {
229-
if (!dates.has(date)) dates.set(date, []);
230-
dates.get(date).push(doc);
359+
const normalized = normalizeDate(date);
360+
if (normalized) {
361+
if (!dates.has(normalized)) dates.set(normalized, []);
362+
dates.get(normalized).push(doc);
363+
}
231364
});
232365
}
233366

234-
// Document types
367+
// Document types (normalize for grouping)
235368
const docType = doc.document_metadata?.document_type;
236369
if (docType) {
237-
if (!documentTypes.has(docType)) documentTypes.set(docType, []);
238-
documentTypes.get(docType).push(doc);
370+
const normalized = normalizeDocType(docType);
371+
if (normalized) {
372+
if (!documentTypes.has(normalized)) documentTypes.set(normalized, []);
373+
documentTypes.get(normalized).push(doc);
374+
}
239375
}
240376
});
241377

@@ -265,13 +401,17 @@ module.exports = function(eleventyConfig) {
265401
docs: dedupeDocArray(docs),
266402
count: dedupeDocArray(docs).length
267403
})).sort((a, b) => b.count - a.count),
268-
dates: Array.from(dates.entries()).map(([name, docs]) => ({
269-
name,
404+
dates: Array.from(dates.entries()).map(([normalizedDate, docs]) => ({
405+
name: formatDate(normalizedDate), // Display formatted version
406+
normalizedDate, // Keep normalized for sorting
270407
docs: dedupeDocArray(docs),
271408
count: dedupeDocArray(docs).length
272-
})).sort((a, b) => b.count - a.count),
273-
documentTypes: Array.from(documentTypes.entries()).map(([name, docs]) => ({
274-
name,
409+
})).sort((a, b) => {
410+
// Sort by normalized date (YYYY-MM-DD format sorts correctly)
411+
return b.normalizedDate.localeCompare(a.normalizedDate);
412+
}),
413+
documentTypes: Array.from(documentTypes.entries()).map(([normalizedType, docs]) => ({
414+
name: formatDocType(normalizedType), // Display formatted version
275415
docs: dedupeDocArray(docs),
276416
count: dedupeDocArray(docs).length
277417
})).sort((a, b) => b.count - a.count)

processing_index.json

Lines changed: 167 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10904,7 +10904,173 @@
1090410904
"IMAGES004/DOJ-OGR-00011129.jpg",
1090510905
"IMAGES004/DOJ-OGR-00011130.jpg",
1090610906
"IMAGES004/DOJ-OGR-00011131.jpg",
10907-
"IMAGES004/DOJ-OGR-00011132.jpg"
10907+
"IMAGES004/DOJ-OGR-00011132.jpg",
10908+
"IMAGES004/DOJ-OGR-00011133.jpg",
10909+
"IMAGES004/DOJ-OGR-00011134.jpg",
10910+
"IMAGES004/DOJ-OGR-00011135.jpg",
10911+
"IMAGES004/DOJ-OGR-00011136.jpg",
10912+
"IMAGES004/DOJ-OGR-00011137.jpg",
10913+
"IMAGES004/DOJ-OGR-00011138.jpg",
10914+
"IMAGES004/DOJ-OGR-00011139.jpg",
10915+
"IMAGES004/DOJ-OGR-00011140.jpg",
10916+
"IMAGES004/DOJ-OGR-00011141.jpg",
10917+
"IMAGES004/DOJ-OGR-00011142.jpg",
10918+
"IMAGES004/DOJ-OGR-00011143.jpg",
10919+
"IMAGES004/DOJ-OGR-00011144.jpg",
10920+
"IMAGES004/DOJ-OGR-00011145.jpg",
10921+
"IMAGES004/DOJ-OGR-00011146.jpg",
10922+
"IMAGES004/DOJ-OGR-00011147.jpg",
10923+
"IMAGES004/DOJ-OGR-00011148.jpg",
10924+
"IMAGES004/DOJ-OGR-00011149.jpg",
10925+
"IMAGES004/DOJ-OGR-00011150.jpg",
10926+
"IMAGES004/DOJ-OGR-00011151.jpg",
10927+
"IMAGES004/DOJ-OGR-00011152.jpg",
10928+
"IMAGES004/DOJ-OGR-00011153.jpg",
10929+
"IMAGES004/DOJ-OGR-00011154.jpg",
10930+
"IMAGES004/DOJ-OGR-00011155.jpg",
10931+
"IMAGES004/DOJ-OGR-00011156.jpg",
10932+
"IMAGES004/DOJ-OGR-00011157.jpg",
10933+
"IMAGES004/DOJ-OGR-00011158.jpg",
10934+
"IMAGES004/DOJ-OGR-00011159.jpg",
10935+
"IMAGES004/DOJ-OGR-00011160.jpg",
10936+
"IMAGES004/DOJ-OGR-00011161.jpg",
10937+
"IMAGES004/DOJ-OGR-00011162.jpg",
10938+
"IMAGES004/DOJ-OGR-00011163.jpg",
10939+
"IMAGES004/DOJ-OGR-00011164.jpg",
10940+
"IMAGES004/DOJ-OGR-00011165.jpg",
10941+
"IMAGES004/DOJ-OGR-00011166.jpg",
10942+
"IMAGES004/DOJ-OGR-00011167.jpg",
10943+
"IMAGES004/DOJ-OGR-00011168.jpg",
10944+
"IMAGES004/DOJ-OGR-00011169.jpg",
10945+
"IMAGES004/DOJ-OGR-00011170.jpg",
10946+
"IMAGES004/DOJ-OGR-00011171.jpg",
10947+
"IMAGES004/DOJ-OGR-00011172.jpg",
10948+
"IMAGES004/DOJ-OGR-00011173.jpg",
10949+
"IMAGES004/DOJ-OGR-00011174.jpg",
10950+
"IMAGES004/DOJ-OGR-00011175.jpg",
10951+
"IMAGES004/DOJ-OGR-00011176.jpg",
10952+
"IMAGES004/DOJ-OGR-00011177.jpg",
10953+
"IMAGES004/DOJ-OGR-00011178.jpg",
10954+
"IMAGES004/DOJ-OGR-00011179.jpg",
10955+
"IMAGES004/DOJ-OGR-00011180.jpg",
10956+
"IMAGES004/DOJ-OGR-00011181.jpg",
10957+
"IMAGES004/DOJ-OGR-00011182.jpg",
10958+
"IMAGES004/DOJ-OGR-00011183.jpg",
10959+
"IMAGES004/DOJ-OGR-00011184.jpg",
10960+
"IMAGES004/DOJ-OGR-00011185.jpg",
10961+
"IMAGES004/DOJ-OGR-00011186.jpg",
10962+
"IMAGES004/DOJ-OGR-00011187.jpg",
10963+
"IMAGES004/DOJ-OGR-00011188.jpg",
10964+
"IMAGES004/DOJ-OGR-00011189.jpg",
10965+
"IMAGES004/DOJ-OGR-00011190.jpg",
10966+
"IMAGES004/DOJ-OGR-00011191.jpg",
10967+
"IMAGES004/DOJ-OGR-00011192.jpg",
10968+
"IMAGES004/DOJ-OGR-00011193.jpg",
10969+
"IMAGES004/DOJ-OGR-00011194.jpg",
10970+
"IMAGES004/DOJ-OGR-00011195.jpg",
10971+
"IMAGES004/DOJ-OGR-00011196.jpg",
10972+
"IMAGES004/DOJ-OGR-00011197.jpg",
10973+
"IMAGES004/DOJ-OGR-00011198.jpg",
10974+
"IMAGES004/DOJ-OGR-00011199.jpg",
10975+
"IMAGES004/DOJ-OGR-00011200.jpg",
10976+
"IMAGES004/DOJ-OGR-00011201.jpg",
10977+
"IMAGES004/DOJ-OGR-00011202.jpg",
10978+
"IMAGES004/DOJ-OGR-00011203.jpg",
10979+
"IMAGES004/DOJ-OGR-00011204.jpg",
10980+
"IMAGES004/DOJ-OGR-00011205.jpg",
10981+
"IMAGES004/DOJ-OGR-00011206.jpg",
10982+
"IMAGES004/DOJ-OGR-00011207.jpg",
10983+
"IMAGES004/DOJ-OGR-00011208.jpg",
10984+
"IMAGES004/DOJ-OGR-00011209.jpg",
10985+
"IMAGES004/DOJ-OGR-00011210.jpg",
10986+
"IMAGES004/DOJ-OGR-00011211.jpg",
10987+
"IMAGES004/DOJ-OGR-00011212.jpg",
10988+
"IMAGES004/DOJ-OGR-00011213.jpg",
10989+
"IMAGES004/DOJ-OGR-00011214.jpg",
10990+
"IMAGES004/DOJ-OGR-00011215.jpg",
10991+
"IMAGES004/DOJ-OGR-00011216.jpg",
10992+
"IMAGES004/DOJ-OGR-00011217.jpg",
10993+
"IMAGES004/DOJ-OGR-00011218.jpg",
10994+
"IMAGES004/DOJ-OGR-00011219.jpg",
10995+
"IMAGES004/DOJ-OGR-00011220.jpg",
10996+
"IMAGES004/DOJ-OGR-00011221.jpg",
10997+
"IMAGES004/DOJ-OGR-00011222.jpg",
10998+
"IMAGES004/DOJ-OGR-00011223.jpg",
10999+
"IMAGES004/DOJ-OGR-00011224.jpg",
11000+
"IMAGES004/DOJ-OGR-00011225.jpg",
11001+
"IMAGES004/DOJ-OGR-00011226.jpg",
11002+
"IMAGES004/DOJ-OGR-00011227.jpg",
11003+
"IMAGES004/DOJ-OGR-00011228.jpg",
11004+
"IMAGES004/DOJ-OGR-00011229.jpg",
11005+
"IMAGES004/DOJ-OGR-00011230.jpg",
11006+
"IMAGES004/DOJ-OGR-00011231.jpg",
11007+
"IMAGES004/DOJ-OGR-00011232.jpg",
11008+
"IMAGES004/DOJ-OGR-00011233.jpg",
11009+
"IMAGES004/DOJ-OGR-00011234.jpg",
11010+
"IMAGES004/DOJ-OGR-00011235.jpg",
11011+
"IMAGES004/DOJ-OGR-00011236.jpg",
11012+
"IMAGES004/DOJ-OGR-00011237.jpg",
11013+
"IMAGES004/DOJ-OGR-00011238.jpg",
11014+
"IMAGES004/DOJ-OGR-00011239.jpg",
11015+
"IMAGES004/DOJ-OGR-00011240.jpg",
11016+
"IMAGES004/DOJ-OGR-00011241.jpg",
11017+
"IMAGES004/DOJ-OGR-00011242.jpg",
11018+
"IMAGES004/DOJ-OGR-00011243.jpg",
11019+
"IMAGES004/DOJ-OGR-00011244.jpg",
11020+
"IMAGES004/DOJ-OGR-00011245.jpg",
11021+
"IMAGES004/DOJ-OGR-00011246.jpg",
11022+
"IMAGES004/DOJ-OGR-00011247.jpg",
11023+
"IMAGES004/DOJ-OGR-00011248.jpg",
11024+
"IMAGES004/DOJ-OGR-00011249.jpg",
11025+
"IMAGES004/DOJ-OGR-00011250.jpg",
11026+
"IMAGES004/DOJ-OGR-00011251.jpg",
11027+
"IMAGES004/DOJ-OGR-00011252.jpg",
11028+
"IMAGES004/DOJ-OGR-00011253.jpg",
11029+
"IMAGES004/DOJ-OGR-00011254.jpg",
11030+
"IMAGES004/DOJ-OGR-00011255.jpg",
11031+
"IMAGES004/DOJ-OGR-00011256.jpg",
11032+
"IMAGES004/DOJ-OGR-00011257.jpg",
11033+
"IMAGES004/DOJ-OGR-00011258.jpg",
11034+
"IMAGES004/DOJ-OGR-00011259.jpg",
11035+
"IMAGES004/DOJ-OGR-00011260.jpg",
11036+
"IMAGES004/DOJ-OGR-00011261.jpg",
11037+
"IMAGES004/DOJ-OGR-00011262.jpg",
11038+
"IMAGES004/DOJ-OGR-00011263.jpg",
11039+
"IMAGES004/DOJ-OGR-00011264.jpg",
11040+
"IMAGES004/DOJ-OGR-00011265.jpg",
11041+
"IMAGES004/DOJ-OGR-00011266.jpg",
11042+
"IMAGES004/DOJ-OGR-00011267.jpg",
11043+
"IMAGES004/DOJ-OGR-00011268.jpg",
11044+
"IMAGES004/DOJ-OGR-00011269.jpg",
11045+
"IMAGES004/DOJ-OGR-00011270.jpg",
11046+
"IMAGES004/DOJ-OGR-00011271.jpg",
11047+
"IMAGES004/DOJ-OGR-00011272.jpg",
11048+
"IMAGES004/DOJ-OGR-00011273.jpg",
11049+
"IMAGES004/DOJ-OGR-00011274.jpg",
11050+
"IMAGES004/DOJ-OGR-00011275.jpg",
11051+
"IMAGES004/DOJ-OGR-00011276.jpg",
11052+
"IMAGES004/DOJ-OGR-00011277.jpg",
11053+
"IMAGES004/DOJ-OGR-00011278.jpg",
11054+
"IMAGES004/DOJ-OGR-00011279.jpg",
11055+
"IMAGES004/DOJ-OGR-00011280.jpg",
11056+
"IMAGES004/DOJ-OGR-00011281.jpg",
11057+
"IMAGES004/DOJ-OGR-00011282.jpg",
11058+
"IMAGES004/DOJ-OGR-00011283.jpg",
11059+
"IMAGES004/DOJ-OGR-00011284.jpg",
11060+
"IMAGES004/DOJ-OGR-00011285.jpg",
11061+
"IMAGES004/DOJ-OGR-00011286.jpg",
11062+
"IMAGES004/DOJ-OGR-00011287.jpg",
11063+
"IMAGES004/DOJ-OGR-00011288.jpg",
11064+
"IMAGES004/DOJ-OGR-00011289.jpg",
11065+
"IMAGES004/DOJ-OGR-00011290.jpg",
11066+
"IMAGES004/DOJ-OGR-00011291.jpg",
11067+
"IMAGES004/DOJ-OGR-00011292.jpg",
11068+
"IMAGES004/DOJ-OGR-00011293.jpg",
11069+
"IMAGES004/DOJ-OGR-00011294.jpg",
11070+
"IMAGES004/DOJ-OGR-00011295.jpg",
11071+
"IMAGES004/DOJ-OGR-00011296.jpg",
11072+
"IMAGES004/DOJ-OGR-00011297.jpg",
11073+
"IMAGES004/DOJ-OGR-00011299.jpg"
1090811074
],
1090911075
"last_updated": "/Users/nickp/code/files"
1091011076
}

0 commit comments

Comments
 (0)