-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: ire-builder-guide.html
More file actions
855 lines (802 loc) · 77 KB
/
ire-builder-guide.html
File metadata and controls
855 lines (802 loc) · 77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>IRE Builder Guide — Institutional Reasoning Engines</title>
<style>
/* Design tokens — shared colour palette and layout constants */
:root {
--navy:#1B3A5C; --mid:#2E6DA4; --light:#D5E8F0;
--accent:#C0392B; --green:#27AE60; --gold:#D4A017;
--gray:#5A5A5A; --lgray:#F4F6F9; --white:#fff; --black:#1A1A1A;
--sw:260px; /* sidebar width; also offsets .main */
}
/* Minimal reset */
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:'Segoe UI',Arial,sans-serif;color:var(--black);background:#EEF2F7}
/* Topbar — fixed 54px bar; sits above sidebar (z 1000 vs 900) */
.topbar{position:fixed;top:0;left:0;right:0;height:54px;background:var(--navy);
display:flex;align-items:center;padding:0 20px;z-index:1000;
box-shadow:0 2px 8px rgba(0,0,0,.3)}
.topbar-title{font-size:15px;font-weight:700;color:#fff;letter-spacing:.3px}
.topbar-badge{margin-left:10px;background:var(--accent);color:#fff;
font-size:11px;font-weight:700;padding:2px 8px;border-radius:10px}
.topbar-sub{margin-left:auto;font-size:12px;color:rgba(255,255,255,.6)}
/* Sidebar — fixed nav below the topbar; hidden entirely under 768px */
.sidebar{position:fixed;top:54px;left:0;bottom:0;width:var(--sw);
background:var(--navy);overflow-y:auto;z-index:900;padding-bottom:40px}
.sb-group{padding:14px 14px 4px;font-size:10px;font-weight:700;
color:rgba(255,255,255,.35);letter-spacing:1.2px;text-transform:uppercase}
/* .sb-link styles <button> elements as nav links — hence the explicit
   background/border/width/text-align overrides of button UA defaults */
.sb-link{display:block;padding:8px 16px 8px 18px;color:rgba(255,255,255,.72);
font-size:13px;text-decoration:none;border-left:3px solid transparent;
cursor:pointer;transition:all .15s;background:none;border-top:none;
border-right:none;border-bottom:none;width:100%;text-align:left}
.sb-link:hover,.sb-link.active{background:rgba(255,255,255,.09);
color:#fff;border-left-color:var(--light)}
.sb-link.sub{padding-left:30px;font-size:12px}
/* Main column — offset by sidebar width and topbar height */
.main{margin-left:var(--sw);padding-top:54px;min-height:100vh}
.content{max-width:980px;margin:0 auto;padding:36px 28px 80px}
/* Sections — hidden by default; the nav's show() handler toggles .active
   so exactly one section is visible at a time. (Previous comment said
   "ALL VISIBLE", which contradicts the display:none rule below.) */
.section{display:none}
.section.active{display:block}
/* Typography */
.page-title{font-size:30px;color:var(--navy);margin-bottom:6px;font-weight:800}
.page-title span{color:var(--mid)}
.page-sub{color:var(--gray);font-size:15px;margin-bottom:28px;line-height:1.65}
h2.sh{font-size:20px;color:var(--navy);margin:36px 0 10px;padding-bottom:7px;
border-bottom:3px solid var(--light);font-weight:700}
h3.sth{font-size:15px;color:var(--mid);margin:20px 0 7px;font-weight:700}
p{line-height:1.72;color:var(--black);margin-bottom:12px;font-size:14.5px}
ul,ol{margin:0 0 12px 22px}
li{line-height:1.7;font-size:14.5px;margin-bottom:3px}
code{background:#E5EBF2;padding:1px 5px;border-radius:3px;font-size:12.5px;font-family:monospace}
strong{color:var(--navy)}
/* Cards */
.cards{display:grid;grid-template-columns:repeat(auto-fill,minmax(240px,1fr));gap:14px;margin:16px 0}
.card{background:#fff;border-radius:9px;padding:18px;box-shadow:0 2px 7px rgba(0,0,0,.07);
border-top:4px solid var(--mid)}
.card.green{border-top-color:var(--green)}
.card.gold{border-top-color:var(--gold)}
.card.red{border-top-color:var(--accent)}
.card.navy{border-top-color:var(--navy)}
.card h4{font-size:13.5px;font-weight:700;color:var(--navy);margin-bottom:6px}
.card p{font-size:12.5px;margin:0;color:var(--gray)}
/* Cluster tags (tag-a … tag-g map to Clusters A–G in the markup) */
.tag{display:inline-block;font-size:10.5px;font-weight:700;padding:2px 7px;
border-radius:8px;margin-bottom:7px}
.tag-a{background:#D5E8F0;color:#1B3A5C}
.tag-b{background:#D5F5E3;color:#1A5C35}
.tag-c{background:#FEF9E7;color:#7D6608}
.tag-d{background:#FDEBD0;color:#784212}
.tag-e{background:#FADBD8;color:#7B241C}
.tag-f{background:#E8DAEF;color:#512E5F}
.tag-g{background:#D1F2EB;color:#0E6655}
/* Callouts — .cl is the small uppercase label inside each callout */
.callout{border-left:5px solid var(--mid);background:var(--light);
padding:14px 18px;border-radius:0 8px 8px 0;margin:16px 0}
.callout.warn{border-left-color:var(--gold);background:#FEF9E7}
.callout.ok{border-left-color:var(--green);background:#EAFAF1}
.callout.danger{border-left-color:var(--accent);background:#FDEDEC}
.callout.dark{border-left-color:var(--navy);background:#EBF2FA}
.cl{font-size:10.5px;font-weight:800;letter-spacing:1px;text-transform:uppercase;
color:var(--mid);margin-bottom:5px}
.callout.warn .cl{color:var(--gold)}
.callout.ok .cl{color:var(--green)}
.callout.danger .cl{color:var(--accent)}
.callout.dark .cl{color:var(--navy)}
.callout p{margin:0;font-size:14px}
/* Tables — .tw wrapper provides horizontal scroll on narrow screens */
.tw{overflow-x:auto;margin:14px 0 20px;border-radius:8px;
box-shadow:0 2px 7px rgba(0,0,0,.07)}
table{width:100%;border-collapse:collapse;background:#fff;font-size:13.5px}
th{background:var(--navy);color:#fff;padding:9px 13px;text-align:left;font-size:12.5px;font-weight:700}
td{padding:8px 13px;border-bottom:1px solid #E5EBF2;vertical-align:top;line-height:1.5}
tr:nth-child(even) td{background:var(--lgray)}
tr:last-child td{border-bottom:none}
/* Diagram */
.dw{background:#fff;border-radius:9px;padding:22px;margin:16px 0;
box-shadow:0 2px 7px rgba(0,0,0,.07);overflow-x:auto}
.dt{font-size:12px;font-weight:700;color:var(--mid);margin-bottom:14px;
letter-spacing:.5px;text-transform:uppercase}
/* Steps — numbered circle (.sn) + body (.sb) */
.steps{margin:16px 0}
.step{display:flex;gap:14px;margin-bottom:18px}
.sn{flex-shrink:0;width:34px;height:34px;border-radius:50%;background:var(--navy);
color:#fff;display:flex;align-items:center;justify-content:center;
font-size:14px;font-weight:800;margin-top:2px}
.sb h4{font-size:14.5px;color:var(--navy);font-weight:700;margin-bottom:3px}
.sb p{font-size:13.5px;margin:0;color:var(--gray)}
/* Hero */
.hero{background:linear-gradient(135deg,var(--navy) 0%,#2E6DA4 100%);
color:#fff;border-radius:11px;padding:36px;margin-bottom:28px}
.hero h2{font-size:26px;font-weight:800;margin-bottom:10px}
.hero p{font-size:15px;opacity:.9;line-height:1.7;margin:0}
/* Phase bar */
.pbar{display:flex;border-radius:8px;overflow:hidden;margin:20px 0}
.pi{flex:1;padding:9px 6px;text-align:center;font-size:11px;
font-weight:700;color:#fff;cursor:pointer}
.pi:hover{opacity:.85}
/* Accordion — max-height animates open/close; 1200px is a cap, so panels
   taller than that will clip when open */
.acc{margin:10px 0;border-radius:8px;overflow:hidden;
box-shadow:0 2px 6px rgba(0,0,0,.06)}
.ah{background:#fff;padding:13px 16px;cursor:pointer;display:flex;
align-items:center;justify-content:space-between;font-weight:700;
font-size:13.5px;color:var(--navy);border-left:4px solid var(--mid)}
.ah:hover{background:var(--lgray)}
.ab{background:#fff;padding:0 16px;overflow:hidden;max-height:0;transition:all .28s}
.ab.open{max-height:1200px;padding:13px 16px;border-top:1px solid #E5EBF2}
.arrow{transition:transform .2s;font-size:11px}
.arrow.open{transform:rotate(90deg)}
/* Cost pills */
.cp{display:inline-block;font-size:11.5px;font-weight:700;padding:2px 9px;border-radius:10px;margin-left:6px}
.free{background:#D5F5E3;color:#1E8449}
.low{background:#FEF9E7;color:#9A7D0A}
.med{background:#FDEBD0;color:#A04000}
.high{background:#FADBD8;color:#922B21}
/* Mobile: drop the sidebar, stack cards and phase bar */
@media(max-width:768px){
.sidebar{display:none}
.main{margin-left:0}
.content{padding:20px 14px 60px}
.cards{grid-template-columns:1fr}
.pbar{flex-wrap:wrap}
.pi{flex:0 0 50%}
}
</style>
</head>
<body>
<!-- Fixed application bar (title, version badge, subtitle).
     <header> is the correct landmark for a page banner; the .topbar class
     is kept so all existing CSS (class selectors only) still applies. -->
<header class="topbar">
<span class="topbar-title">IRE Builder Guide</span>
<span class="topbar-badge">v1.2</span>
<span class="topbar-sub">Institutional Reasoning Engines</span>
</header>
<!-- Section navigation. These are <button>s (not links) because activating
     one swaps the visible .section in-page via show() — no URL change.
     type="button" is explicit: the default type is "submit".
     NOTE(review): inline onclick handlers kept because show() is defined
     elsewhere in this file; migrating to addEventListener means touching
     that script. -->
<nav class="sidebar" id="sidebar" aria-label="Guide sections">
<div class="sb-group">Overview</div>
<button class="sb-link active" type="button" onclick="show('intro')">What Is an IRE?</button>
<button class="sb-link" type="button" onclick="show('arch')">Architecture at a Glance</button>
<div class="sb-group">Build Guide</div>
<button class="sb-link" type="button" onclick="show('prereqs')">Prerequisites</button>
<button class="sb-link" type="button" onclick="show('tools')">Tools &amp; Subscriptions</button>
<button class="sb-link" type="button" onclick="show('costs')">Cost Breakdown</button>
<button class="sb-link" type="button" onclick="show('phases')">Build Phases</button>
<div class="sb-group">Workflows</div>
<button class="sb-link" type="button" onclick="show('flow-case')">Case Flow</button>
<button class="sb-link" type="button" onclick="show('flow-ingest')">Document Ingestion</button>
<button class="sb-link" type="button" onclick="show('flow-entity')">Entity Resolution</button>
<button class="sb-link" type="button" onclick="show('flow-reason')">Reasoning Loop</button>
<button class="sb-link" type="button" onclick="show('flow-privacy')">Privacy Gateway</button>
<button class="sb-link" type="button" onclick="show('flow-audit')">Audit Chain</button>
<div class="sb-group">Clusters</div>
<button class="sb-link sub" type="button" onclick="show('ca')">Cluster A — Documentary</button>
<button class="sb-link sub" type="button" onclick="show('cb')">Cluster B — Behavioural</button>
<button class="sb-link sub" type="button" onclick="show('cc')">Cluster C — Network</button>
<button class="sb-link sub" type="button" onclick="show('cd')">Cluster D — Reasoning</button>
<button class="sb-link sub" type="button" onclick="show('ce')">Cluster E — Privacy</button>
<button class="sb-link sub" type="button" onclick="show('cf')">Cluster F — Memory</button>
<button class="sb-link sub" type="button" onclick="show('cg')">Cluster G — Integrity</button>
<div class="sb-group">More</div>
<button class="sb-link" type="button" onclick="show('ux')">Investigator UX</button>
<button class="sb-link" type="button" onclick="show('ai')">Building With AI</button>
<button class="sb-link" type="button" onclick="show('profiles')">Deployment Profiles</button>
<button class="sb-link" type="button" onclick="show('faq')">FAQ</button>
</nav>
<div class="main"><div class="content">
<!-- ═══════ INTRO ═══════ -->
<!-- Intro section — the only .section with .active by default.
     NOTE(review): the hero's <h2> precedes the page <h1>; heading order is
     inverted. Left as-is because the hero layout depends on .hero h2 —
     confirm whether this is intended. -->
<div class="section active" id="s-intro">
<div class="hero">
<h2>Build Your Own Institutional Reasoning Engine</h2>
<p>A complete guide for non-technical founders — every tool, subscription, workflow, and flowchart needed to build and deploy this architecture. Plain English throughout.</p>
<div class="callout dark"><div class="cl">Two Documents — Two Distinct Roles</div>
<p><strong>This Builder Guide is the execution document.</strong> It answers: "Can we actually build this, and how?" It is designed for developers, technical co-founders, and implementation teams.<br><br>
The companion <strong>IRE Whitepaper (IRE_v1.2_Final.pdf)</strong> is the strategic and credibility document. It defines the category, establishes the architectural foundation, and answers: "Should this exist, and is it credible?" It is designed for investors, CXOs, regulators, and senior advisors.<br><br>
Use the whitepaper to establish <em>why</em>. Use this guide to establish <em>how</em>.</p></div>
</div>
<h1 class="page-title">What Is an <span>Institutional Reasoning Engine?</span></h1>
<p class="page-sub">An IRE is not a chatbot. Not automation. It is an AI system that runs a structured investigation workflow — forming a hypothesis, iterating through evidence, and requiring human approval at every finding before it enters the record.</p>
<div class="callout dark"><div class="cl">The One-Sentence Version</div>
<p>A professional investigation workflow encoded in software — AI accelerates the analysis, humans remain accountable for every conclusion.</p></div>
<h2 class="sh">The Three Things That Make It Different</h2>
<div class="cards">
<div class="card navy"><h4>🔒 Methodology as Engineering</h4><p>Investigation rules are baked into the system — not left to human discretion. The AI cannot skip a step any more than a calculator can divide by zero.</p></div>
<div class="card green"><h4>👤 Human Gate at Every Finding</h4><p>No AI output enters the official record without a named investigator approving it. The approval IS the record — not a step before the record.</p></div>
<div class="card gold"><h4>🔗 Tamper-Evident Audit Chain</h4><p>Every action is cryptographically linked. Alter one log entry and every subsequent entry breaks. Mathematically detectable.</p></div>
</div>
<h2 class="sh">What It Is vs What It Is Not</h2>
<div class="tw"><table>
<tr><th scope="col">This IS</th><th scope="col">This IS NOT</th></tr>
<tr><td>A methodology encoding framework — process as engineering constraints</td><td>A replacement for investigator judgment</td></tr>
<tr><td>Modular — scales from 8 base components to 35</td><td>A product you can buy off the shelf</td></tr>
<tr><td>Applicable to any regulated investigation, any industry</td><td>A compliance guarantee in your jurisdiction</td></tr>
<tr><td>AI-accelerated with mandatory human gates</td><td>An autonomous AI that makes decisions independently</td></tr>
<tr><td>Tamper-evident by default; independently verifiable with Cluster G active</td><td>Unconditionally immutable in all configurations</td></tr>
<tr><td>Upgradeable incrementally — start small, grow</td><td>A system requiring full build before delivering value</td></tr>
</table></div>
</div>
<!-- ═══════ ARCHITECTURE ═══════ -->
<!-- Architecture overview: diagram, base-component table, cluster cards. -->
<div class="section" id="s-arch">
<h1 class="page-title">Architecture <span>at a Glance</span></h1>
<p class="page-sub">Two layers: a mandatory base every deployment must include, and modular clusters that activate based on your investigation context.</p>
<div class="dw"><div class="dt">Full IRE Architecture — Base + 7 Clusters</div>
<img src="diagrams/diagram-1.svg" alt="Full IRE architecture diagram: the 8 mandatory base components plus the 7 modular clusters A to G" style="width:100%;max-width:100%;display:block;"></div>
<h2 class="sh">The 8 Mandatory Base Components</h2>
<div class="tw"><table>
<tr><th scope="col">#</th><th scope="col">Component</th><th scope="col">Plain English</th><th scope="col">Why Non-Negotiable</th></tr>
<tr><td><strong>B1</strong></td><td>Case-Scoped Isolation</td><td>Each case gets its own sealed container. Evidence from Case A cannot leak into Case B — by design.</td><td>Cross-contamination makes findings legally indefensible.</td></tr>
<tr><td><strong>B2</strong></td><td>Retrieval Verifier</td><td>Before any AI claim reaches an investigator, the system checks it exists in the actual case documents. A database lookup — not AI judgment.</td><td>Prevents hallucinated findings entering the record.</td></tr>
<tr><td><strong>B3</strong></td><td>Evidence Grounder</td><td>Every verified claim gets a citation — document name, page number, chunk ID. Like footnotes, automatic and tamper-evident.</td><td>Regulators and courts demand traceable findings.</td></tr>
<tr><td><strong>B4</strong></td><td>Human Review Gate</td><td>No AI finding enters the official record without a named investigator approving it. The approval event is itself logged.</td><td>Accountability. A human is responsible for every finding.</td></tr>
<tr><td><strong>B5</strong></td><td>Audit Event Logger</td><td>Every action logged: what was queried, retrieved, which AI model used, who approved what, when.</td><td>You cannot defend a record you cannot reconstruct.</td></tr>
<tr><td><strong>B6</strong></td><td>Hash-Chain Immutability</td><td>Each log entry cryptographically linked to the previous. Altering any entry breaks every entry after it.</td><td>Proves the record was not tampered with after the fact.</td></tr>
<tr><td><strong>B7</strong></td><td>Model Version Pinning</td><td>Every case stores exactly which AI model version was used. Mid-investigation upgrades are blocked.</td><td>Reproducibility — you must re-run the same reasoning months later.</td></tr>
<tr><td><strong>B8</strong></td><td>Algorithmic Bias Monitor</td><td>Monitors whether the AI is systematically over-flagging any demographic group. Active from day one.</td><td>Biased AI in regulated investigation is a liability, not a quirk.</td></tr>
</table></div>
<h2 class="sh">The 7 Modular Clusters</h2>
<div class="cards">
<div class="card"><div class="tag tag-a">Cluster A</div><h4>Documentary Evidence</h4><p>Turns any document pile into searchable evidence. PDFs, spreadsheets, emails — queryable by meaning.</p></div>
<div class="card"><div class="tag tag-b">Cluster B</div><h4>Behavioural &amp; Interview</h4><p>Detects contradictions between interview statements and documentary evidence. Cites both sides automatically.</p></div>
<div class="card"><div class="tag tag-c">Cluster C</div><h4>Entity &amp; Network Intelligence</h4><p>Maps who is connected to whom. Surfaces circular flows and fraud rings invisible from individual documents.</p></div>
<div class="card"><div class="tag tag-d">Cluster D</div><h4>Advanced Reasoning</h4><p>The recursive reasoning engine. Forms hypotheses, tests them, revises them, repeats until confident or flagged.</p></div>
<div class="card"><div class="tag tag-e">Cluster E</div><h4>Privacy Gateway</h4><p>Replaces real identifiers with tokens before anything reaches external AI. De-tokenises only on approval.</p></div>
<div class="card"><div class="tag tag-f">Cluster F</div><h4>Institutional Memory</h4><p>Fine-tunes the AI on your own completed cases. The system learns your fraud typologies over time.</p></div>
<div class="card"><div class="tag tag-g">Cluster G</div><h4>Integrity Elevation</h4><p>Daily external anchoring of the audit chain. Elevates from institutional-grade to independently verifiable audit integrity.</p></div>
</div>
</div>
<!-- ═══════ PREREQUISITES ═══════ -->
<!-- Prerequisites section: 3 process steps + technical/skills tables. -->
<div class="section" id="s-prereqs">
<h1 class="page-title">Prerequisites</h1>
<p class="page-sub">Three things must be true before you write a single line of code. The system encodes your methodology — if it is not documented, the build will fail at the hardest moment.</p>
<div class="callout warn"><div class="cl">Do This First</div>
<p>Most teams skip prerequisites and then wonder why the AI produces inconsistent outputs. The IRE encodes your investigation methodology — if that methodology is not written down, there is nothing to encode.</p></div>
<div class="steps">
<div class="step"><div class="sn">1</div><div class="sb"><h4>Document Your Investigation Methodology</h4>
<p>Write down exactly how investigations work today: how a case is opened, what evidence types you collect, what constitutes a finding, what your output documents look like (STR, HLIR, case closure memo). This becomes the specification the system is built against. Use Claude to help draft this — describe your process and ask it to structure it as a methodology document.</p></div></div>
<div class="step"><div class="sn">2</div><div class="sb"><h4>Complete a Data Inventory</h4>
<p>List every type of evidence you routinely handle: bank statement CSVs, KYC PDFs, email exports, interview recordings, transaction logs. For each type note: format, where stored, what personal data it contains. This determines which clusters you need and shapes your Cluster E design.</p></div></div>
<div class="step"><div class="sn">3</div><div class="sb"><h4>Get Legal &amp; Compliance Sign-Off</h4>
<p>Before AI touches investigation data, your legal team must sign off on: (a) human review gate requirements in your jurisdiction, (b) whether pseudonymisation is sufficient for your data protection obligations, (c) whether AI-generated outputs are admissible in your regulatory proceedings.</p></div></div>
</div>
<h2 class="sh">Technical Prerequisites</h2>
<div class="tw"><table>
<tr><th scope="col">Prerequisite</th><th scope="col">What You Need</th><th scope="col">If You Don't Have It</th></tr>
<tr><td>Server / Cloud Environment</td><td>Linux server or cloud VM with at least 32GB RAM for local AI models</td><td>AWS EC2 g4dn.2xlarge (~$0.75/hr, ₹63/hr). Spin up only when needed.</td></tr>
<tr><td>Python 3.10+</td><td>Entire stack is Python-based. AI tools write the code — server just needs Python.</td><td>Any Ubuntu 22.04 LTS VM includes Python.</td></tr>
<tr><td>Docker</td><td>Most components (Qdrant, Neo4j, n8n) run as Docker containers.</td><td>Docker Desktop for dev, Docker Engine for production.</td></tr>
<tr><td>Git &amp; GitHub</td><td>Version control. Enables AI coding tools most effectively.</td><td>Free. github.com. 5 commands is all you need.</td></tr>
</table></div>
<h2 class="sh">Skills You Need</h2>
<div class="callout ok"><div class="cl">Good News for Non-Technical Founders</div>
<p>With Claude Code or Cursor, you can build the majority of this system with 1 developer. The architecture is well-defined — the hard part is design, not code.</p></div>
<div class="cards">
<div class="card green"><h4>You — Domain Expert</h4><p>Investigation methodology, compliance requirements, user stories, QA of outputs. Cannot be outsourced.</p></div>
<div class="card"><h4>1 Python Developer</h4><p>Builds the pipeline, integrates tools, writes orchestration code. AI tools handle 60–70% of actual code.</p></div>
<div class="card gold"><h4>Legal Counsel (Part-Time)</h4><p>Reviews AI-in-investigation policies, data processing agreements, admissibility. One engagement to set up.</p></div>
</div>
</div>
<!-- ═══════ TOOLS ═══════ -->
<!-- Tools & subscriptions: one table per stack layer.
     All header cells carry scope="col" for screen-reader column association. -->
<div class="section" id="s-tools">
<h1 class="page-title">Tools <span>&amp; Subscriptions</span></h1>
<p class="page-sub">Every tool needed to build the full IRE stack. Organised by component, with free and paid options at each layer.</p>
<h2 class="sh">Vector Database — Case-Scoped Evidence Store</h2>
<div class="callout"><div class="cl">What This Is</div><p>Stores all case documents as mathematical representations so they can be searched by meaning. One namespace per case = isolation by architecture.</p></div>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">Type</th><th scope="col">Cost</th><th scope="col">Best For</th></tr>
<tr><td><strong>Qdrant</strong></td><td>Open Source</td><td><span class="cp free">Free</span> self-hosted / $25/mo cloud</td><td>Most teams — best balance. Our default recommendation.</td></tr>
<tr><td>ChromaDB</td><td>Open Source</td><td><span class="cp free">Free</span></td><td>Development / prototyping</td></tr>
<tr><td>Weaviate</td><td>Open Source</td><td><span class="cp free">Free</span> / $25+/mo</td><td>Larger teams needing more query features</td></tr>
<tr><td>Pinecone</td><td>Cloud SaaS</td><td><span class="cp low">$70+/mo</span></td><td>Teams avoiding infrastructure — but data leaves environment</td></tr>
</table></div>
<h2 class="sh">Graph Database — Entity &amp; Network Intelligence</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">Type</th><th scope="col">Cost</th><th scope="col">Notes</th></tr>
<tr><td><strong>Neo4j Community</strong></td><td>Open Source</td><td><span class="cp free">Free</span></td><td>Standard choice. Mature Cypher query language. Self-hosted = data on-premise.</td></tr>
<tr><td>Memgraph</td><td>Open Source</td><td><span class="cp free">Free</span></td><td>Faster for real-time streaming graph updates.</td></tr>
<tr><td>Neo4j AuraDB</td><td>Cloud SaaS</td><td><span class="cp low">$65+/mo</span></td><td>Managed cloud. Check data residency before use.</td></tr>
</table></div>
<h2 class="sh">Local AI Models — On-Premise Reasoning</h2>
<div class="callout warn"><div class="cl">Why Local Models Matter</div><p>Investigation data is sensitive. The majority of AI reasoning should happen on local models — data never leaves your environment. External AI is used only for final narrative generation, always pseudonymised first.</p></div>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Does</th><th scope="col">Cost</th><th scope="col">Models</th><th scope="col">Hardware</th></tr>
<tr><td><strong>Ollama</strong></td><td>Runs local LLMs. Simplest setup available.</td><td><span class="cp free">Free</span></td><td>Llama 3.1 70B (Agent), Llama 3.1 8B (Orchestration)</td><td>70B: 40GB+ GPU RAM. 8B: 8GB GPU or fast CPU.</td></tr>
<tr><td>vLLM</td><td>High-performance serving for production</td><td><span class="cp free">Free</span></td><td>Any HuggingFace model</td><td>NVIDIA GPU required</td></tr>
<tr><td>BGE-M3 Embedding</td><td>Converts text to vectors for semantic search</td><td><span class="cp free">Free</span></td><td>via Ollama</td><td>Runs on CPU — no GPU needed</td></tr>
</table></div>
<h2 class="sh">External AI APIs — SOTA Reasoning (Pseudonymised Only)</h2>
<div class="tw"><table>
<tr><th scope="col">Provider</th><th scope="col">Models</th><th scope="col">Cost / M tokens</th><th scope="col">INR / M tokens</th></tr>
<tr><td><strong>Anthropic</strong></td><td>Claude Sonnet / Opus</td><td><span class="cp low">$3–$15</span></td><td>₹250–₹1,250</td></tr>
<tr><td>OpenAI</td><td>GPT-4o / GPT-4 Turbo</td><td><span class="cp low">$5–$10</span></td><td>₹420–₹840</td></tr>
<tr><td>Google</td><td>Gemini 1.5 Pro</td><td><span class="cp low">$3.50</span></td><td>₹295</td></tr>
<tr><td>Sarvam AI</td><td>Indian languages</td><td><span class="cp free">~$0.50</span></td><td>₹42</td></tr>
</table></div>
<h2 class="sh">Pipeline Orchestration</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Does</th><th scope="col">Cost</th><th scope="col">Notes</th></tr>
<tr><td><strong>n8n</strong></td><td>Visual workflow builder. Connects components without writing orchestration code from scratch.</td><td><span class="cp free">Free</span> self-hosted</td><td>Best for non-technical teams.</td></tr>
<tr><td>LangGraph</td><td>Python framework for agentic reasoning loops (the recursive reasoning in Cluster D).</td><td><span class="cp free">Free</span></td><td>Best option for the Planner-Analyzer-Critic loop.</td></tr>
<tr><td>LangChain</td><td>Python AI pipeline framework. More code, more control.</td><td><span class="cp free">Free</span></td><td>Developer-facing.</td></tr>
</table></div>
<h2 class="sh">PII Detection &amp; Pseudonymisation</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Does</th><th scope="col">Cost</th></tr>
<tr><td><strong>Microsoft Presidio</strong></td><td>Open-source PII detection + anonymisation. Supports custom recognisers for investigation-specific entities. 100% on-premise.</td><td><span class="cp free">Free</span></td></tr>
<tr><td>spaCy NER</td><td>Named entity recognition. Use as a component within Presidio.</td><td><span class="cp free">Free</span></td></tr>
</table></div>
<h2 class="sh">Document Processing &amp; OCR</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Does</th><th scope="col">Cost</th></tr>
<tr><td><strong>PyMuPDF (fitz)</strong></td><td>Fast PDF text extraction. First choice for text-based PDFs.</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Tesseract OCR</td><td>Extracts text from scanned PDFs and images.</td><td><span class="cp free">Free</span></td></tr>
<tr><td>unstructured.io</td><td>Handles mixed document types (PDF, DOCX, XLSX, email) with one API.</td><td><span class="cp free">Free</span> open source</td></tr>
<tr><td>AWS Textract</td><td>Managed OCR with table extraction. Best for complex bank statement layouts.</td><td><span class="cp low">$1.50 / 1,000 pages</span> (₹126)</td></tr>
</table></div>
<h2 class="sh">Speech-to-Text — Interview Transcripts</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Does</th><th scope="col">Cost</th></tr>
<tr><td><strong>Whisper (Faster-Whisper)</strong></td><td>Local speech-to-text. Runs on-premise — audio never leaves your environment. Best for sensitive recordings.</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Sarvam AI STT</td><td>Indian language speech-to-text.</td><td><span class="cp free">~₹1/min</span></td></tr>
<tr><td>Deepgram</td><td>Cloud STT with speaker diarisation. Only for non-sensitive recordings.</td><td><span class="cp low">$0.0043/min</span> (₹0.36/min)</td></tr>
</table></div>
<h2 class="sh">Audit Chain &amp; Integrity</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Does</th><th scope="col">Cost</th></tr>
<tr><td><strong>PostgreSQL</strong></td><td>Stores the hash-chain audit log. Append-only table. Self-hosted.</td><td><span class="cp free">Free</span></td></tr>
<tr><td>AWS QLDB</td><td>External notarisation ledger (Cluster G only).</td><td><span class="cp low">~$5–20/mo</span></td></tr>
<tr><td>Azure Confidential Ledger</td><td>Alternative to QLDB for Azure stacks.</td><td><span class="cp low">~$10–25/mo</span></td></tr>
</table></div>
</div>
<!-- ═══════ COSTS ═══════ -->
<!-- Cost breakdown: figures redacted (****) — available on request. -->
<div class="section" id="s-costs">
<h1 class="page-title">Cost <span>Breakdown</span></h1>
<p class="page-sub">Detailed cost figures available on request. Contact for pricing information.</p>
<div class="callout ok"><div class="cl">The Honest Summary</div>
<p>The majority of the stack is free and open source. Costs scale with deployment profile and case volume. Contact us for a detailed cost model tailored to your organisation.</p></div>
<h2 class="sh">One-Time Build Costs</h2>
<div class="tw"><table>
<tr><th scope="col">Item</th><th scope="col">USD</th><th scope="col">INR</th></tr>
<tr><td>Developer time — Profile 3 MVP</td><td>****</td><td>****</td></tr>
<tr><td>Legal review — AI-in-investigation policies</td><td>****</td><td>****</td></tr>
<tr><td>Security review</td><td>****</td><td>****</td></tr>
<tr><td>Domain + SSL (optional web interface)</td><td>****</td><td>****</td></tr>
</table></div>
<h2 class="sh">Monthly Running Costs by Profile</h2>
<div class="tw"><table>
<tr><th scope="col">Profile</th><th scope="col">Cases/Month</th><th scope="col">USD/Month</th><th scope="col">INR/Month</th><th scope="col">Main Cost Driver</th></tr>
<tr><td><strong>1 — Minimal</strong></td><td>****</td><td>****</td><td>****</td><td>Cloud VM only. No AI API cost.</td></tr>
<tr><td><strong>2 — Interview-Led</strong></td><td>****</td><td>****</td><td>****</td><td>VM + Whisper STT compute.</td></tr>
<tr><td><strong>3 — Document Investigation</strong></td><td>****</td><td>****</td><td>****</td><td>VM + optional SOTA API.</td></tr>
<tr><td><strong>4 — Corporate Fraud</strong></td><td>****</td><td>****</td><td>****</td><td>VM with 70B GPU model + Neo4j.</td></tr>
<tr><td><strong>5 — Financial Crime</strong></td><td>****</td><td>****</td><td>****</td><td>GPU compute + SOTA API.</td></tr>
<tr><td><strong>6 — Enterprise Forensic</strong></td><td>****</td><td>****</td><td>****</td><td>Full stack + fine-tuning + notarisation.</td></tr>
</table></div>
<div class="callout"><div class="cl">Cost Optimisation Tips</div>
<p>Use <strong>spot/preemptible instances</strong> for GPU compute — significantly cheaper. <strong>Only run the large model when needed</strong> — use the smaller model for orchestration. <strong>Batch SOTA API calls</strong> — one per section, not per claim. <strong>Cache embeddings</strong> — documents don't need re-embedding unless changed.</p></div>
</div>
<!-- ═══════ PHASES ═══════ -->
<div class="section" id="s-phases">
<h1 class="page-title">Build <span>Phases</span></h1>
<p class="page-sub">Each phase delivers standalone value — you do not need to finish everything before getting something useful.</p>
<div class="pbar">
<div class="pi" style="background:#1B3A5C">Phase 1<br><small>Base</small></div>
<div class="pi" style="background:#2E6DA4">Phase 2<br><small>Cluster A</small></div>
<div class="pi" style="background:#27AE60">Phase 3<br><small>Cluster B</small></div>
<div class="pi" style="background:#D4A017">Phase 4<br><small>C + D</small></div>
<div class="pi" style="background:#C0392B">Phase 5<br><small>E + G</small></div>
<div class="pi" style="background:#512E5F">Phase 6<br><small>Reports</small></div>
</div>
<h2 class="sh">Phase 1 — Mandatory Base (Weeks 1–6)</h2>
<div class="callout ok"><div class="cl">Deliverable</div><p>A working case management system with tamper-evident audit logging and human approval gates. Defensible output even with no AI reasoning yet.</p></div>
<h3 class="sth">What to Build</h3>
<ol><li>Case creation and management (PostgreSQL + simple API)</li><li>Case-scoped namespaces in Qdrant (B1)</li><li>Audit event logger writing to PostgreSQL (B5)</li><li>Hash-chain — SHA-256 chaining on every log entry (B6)</li><li>Human review gate — approval UI writing to audit chain (B4)</li><li>Model version pinning — store model_id + version with every AI call (B7)</li><li>Bias monitoring dashboard — basic demographic flag rate tracker (B8)</li></ol>
<h3 class="sth">Exit Criteria</h3>
<ul><li>10 synthetic cases run end-to-end with complete audit chains</li><li>Hash chain verification confirms no tampering detectable</li><li>Human review gate blocks record creation without approval event</li><li>Zero cross-case data leakage confirmed by namespace isolation test</li></ul>
<h2 class="sh">The Real Implementation Challenge: Data Messiness</h2>
<div class="callout danger"><div class="cl">What Actually Dominates Months 1–3</div>
<p>AI reasoning failures are not your biggest early problem. Data quality is. Plan for this from day one — not as an afterthought when investigators complain that documents are "not showing up."</p></div>
<h3 class="sth">The Most Common Production Blockers</h3>
<div class="tw"><table>
<tr><th scope="col">Problem</th><th scope="col">What Happens</th><th scope="col">Fix</th></tr>
<tr><td>Scanned PDFs without OCR layer</td><td>PyMuPDF returns empty string. Ingestion pipeline produces empty chunks. Evidence silently missing from case.</td><td>Detect text layer presence before parsing. If empty → route to Tesseract automatically. Never fail silently.</td></tr>
<tr><td>Password-protected files</td><td>Parser throws cryptic error. File never ingests. Investigator doesn't know.</td><td>Pre-ingestion check: detect encryption before upload. Return clear error: "File is encrypted — please provide decrypted version."</td></tr>
<tr><td>Non-standard CSV encodings</td><td>pandas throws UnicodeDecodeError on GBK, Windows-1252, or ISO-8859-1 encoded bank statements.</td><td>Detect encoding with chardet before parsing any CSV. Normalise to UTF-8 at ingestion.</td></tr>
<tr><td>Inconsistent date formats</td><td>DD/MM/YYYY from one bank, MM/DD/YYYY from another, epoch timestamps from transaction logs. Chronology breaks.</td><td>Normalise all dates to ISO 8601 at ingestion. Log original format in metadata. Never assume format.</td></tr>
<tr><td>Garbled entity names from OCR</td><td>"Rajesh Kumar" becomes "R@jesh Kum4r" in a scanned KYC. Entity resolution fails. Graph node phantom.</td><td>OCR quality threshold: flag chunks where character confidence drops below 80%. Route to human review before entity extraction.</td></tr>
<tr><td>Inconsistent column headers across bank statement providers</td><td>"Amount" vs "Debit" vs "CR/DR" vs "Transaction Value" — same data, different labels across providers.</td><td>Build a column normalisation map per provider. Maintain it as a config file, not hardcoded logic.</td></tr>
<tr><td>Handwritten annotations on documents</td><td>OCR accuracy drops to 40–60% on handwriting. Worse on Indian scripts.</td><td>Flag handwritten content separately. Route to human transcription. Never auto-ingest without QC.</td></tr>
</table></div>
<div class="callout warn"><div class="cl">Build a Rejected Documents Log</div>
<p>Every case should have a <code>rejected_documents</code> log — every file that failed ingestion and exactly why. This serves two purposes: (1) the investigator knows their evidence is incomplete; (2) you have a systematic record of preprocessing failures to fix in the next sprint.</p></div>
<h2 class="sh">Phase 2 — Cluster A: Document Ingestion (Weeks 7–14)</h2>
<div class="callout ok"><div class="cl">Deliverable</div><p>Investigators can upload any document type and query it by meaning. Evidence retrieval under 1 second. Every claim cites its source and page.</p></div>
<ol><li>Document parser — PyMuPDF for PDFs, openpyxl for Excel, python-docx for Word</li><li>Document-type-aware chunker — different strategies per file type</li><li>Local embedding model — BGE-M3 via Ollama</li><li>Qdrant indexer — writes to case-scoped namespace</li><li>Retrieval Verifier — checks claims against namespace (B2)</li><li>Evidence Grounder — attaches chunk_id + source + page (B3)</li></ol>
<h2 class="sh">Phase 3 — Cluster B: Behavioural (Months 3–4)</h2>
<div class="callout ok"><div class="cl">Deliverable</div><p>Interview transcripts ingested and searchable. Contradiction detection live.</p></div>
<ol><li>Whisper STT — local transcription with speaker diarisation</li><li>Transcript chunker — speaker-turn level with timestamp + speaker metadata</li><li>Extended PII recognisers for Presidio — spoken-form names, informal references</li><li>Contradiction Detector — cross-references transcript claims vs document corpus</li><li>Transcript Approval Gate — separate HITL flag for statement-derived findings</li></ol>
<h2 class="sh">Phase 4 — Clusters C + D: Graph + Reasoning (Months 4–6)</h2>
<div class="callout ok"><div class="cl">Deliverable</div><p>Entity network mapping live. Recursive reasoning loop operational. Evidence scoring running. Profile 4 capability achieved.</p></div>
<ol><li>Entity extractor — NER pipeline feeding resolution queue</li><li>Three-tier entity resolution — Tier 1 auto, Tier 2/3 blocking HITL screens</li><li>Neo4j graph builder — entities as nodes, transactions as weighted edges</li><li>Graph query layer — circular flow, community detection, path analysis</li><li>LangGraph reasoning loop — Planner → Analyzer → Verifier → Grounder → Critic → Score</li><li>Calibrated evidence scorer — 4-component deterministic score</li></ol>
<h2 class="sh">Phase 5 — Clusters E + G: Privacy + Integrity (Months 6–10)</h2>
<div class="callout warn"><div class="cl">Legal Sign-Off Required Before This Phase</div><p>Cluster E activates external AI APIs. Get legal review of pseudonymisation approach and data processing agreements first.</p></div>
<ol><li>Presidio pseudonymisation pipeline with custom recognisers</li><li>Token map store — in-memory, encrypted at rest, session-scoped, never leaves environment</li><li>De-tokenisation gate — restores identifiers only post-approval, on-premise</li><li>External SOTA API integration — receives pseudonymised payload only</li><li>G1 — AWS QLDB integration for daily audit chain anchoring</li><li>G2 — Recording file hash → transcript linkage in audit chain</li></ol>
<h2 class="sh">Phase 6 — Report Generation + Go Live (Month 10+)</h2>
<div class="callout ok"><div class="cl">Deliverable</div><p>Full draft report in 60–90 seconds. Every claim cited. Human review protocol live. New system is primary investigation record.</p></div>
<ol><li>Query template library — one RAG + graph query pair per report section</li><li>Parallel section generator — all sections generated simultaneously</li><li>Section review UI — finding + citation map side by side, edit + approve</li><li>Citation Map appendix — auto-generated index of all findings to sources</li><li>Final report lock — approved sections assembled, report hash in audit chain</li></ol>
</div>
<!-- ═══════ FLOW: CASE ═══════ -->
<div class="section" id="s-flow-case">
<h1 class="page-title">Workflow: <span>End-to-End Case Flow</span></h1>
<p class="page-sub">How a case moves through the full IRE system — from opening to locked report.</p>
<div class="dw"><div class="dt">Full Case Lifecycle</div>
<img src="diagrams/diagram-2.svg" alt="Full case lifecycle flow diagram, from case opening through evidence upload, reasoning, and review to locked report" style="width:100%;max-width:100%;display:block;"></div>
<h2 class="sh">Investigator Accountability Schema</h2>
<p>Logging that a human approved a finding is necessary but not sufficient. For outputs to be genuinely audit-ready, the approval record must capture the quality and basis of the human judgment — not just the fact that it occurred.</p>
<div class="tw"><table>
<tr><th scope="col">Field</th><th scope="col">Type</th><th scope="col">Required</th><th scope="col">Purpose</th></tr>
<tr><td><code>investigator_id</code></td><td>UUID</td><td>Yes</td><td>Identity binding — who approved</td></tr>
<tr><td><code>action</code></td><td>approve / edit_approve / reject / escalate</td><td>Yes</td><td>Nature of decision</td></tr>
<tr><td><code>diff_hash</code></td><td>SHA-256</td><td>Yes</td><td>Hash of any edits made before approval — proves what changed</td></tr>
<tr><td><code>confidence_level</code></td><td>high / medium / low / uncertain</td><td>Yes</td><td>Investigator's stated confidence</td></tr>
<tr><td><code>rationale</code></td><td>Free text, min 20 chars</td><td>If confidence = low</td><td>Forces articulation of doubt when confidence is low</td></tr>
<tr><td><code>disagreement_flag</code></td><td>Boolean</td><td>Yes</td><td>True if investigator disagrees with AI finding but approves for procedural reasons</td></tr>
<tr><td><code>disagreement_note</code></td><td>Free text</td><td>If flag=true</td><td>Documents the nature of disagreement for the record</td></tr>
<tr><td><code>elapsed_review_time</code></td><td>Seconds</td><td>Yes</td><td>How long spent on this section — detects rubber-stamping</td></tr>
</table></div>
<div class="callout danger"><div class="cl">Rubber-Stamp Detection</div>
<p>An investigator approving 95%+ of findings without edits and with review times under 10 seconds per section is rubber-stamping — not reviewing. <code>elapsed_review_time</code> below a minimum threshold (suggest 30 seconds per section) triggers a quality review flag. This makes the human gate architecturally meaningful rather than ceremonial.</p></div>
<h2 class="sh">What Happens at Each Stage</h2>
<div class="tw"><table>
<tr><th scope="col">Stage</th><th scope="col">System Does</th><th scope="col">Investigator Does</th><th scope="col">Logged</th></tr>
<tr><td>Case Open</td><td>Creates isolated namespace, initialises hash chain</td><td>Names case, sets scope</td><td>case_open event</td></tr>
<tr><td>Evidence Upload</td><td>Parses, chunks, embeds, indexes all files</td><td>Uploads files, then waits (async)</td><td>ingestion event per file with file_hash</td></tr>
<tr><td>Entity Resolution</td><td>Proposes merges with evidence for/against</td><td>Approves/rejects Tier 2/3 decisions</td><td>resolution decision + evidence presented</td></tr>
<tr><td>Graph Build</td><td>Constructs entity network automatically</td><td>Reviews graph for obvious errors</td><td>graph_build event</td></tr>
<tr><td>Reasoning Loop</td><td>Runs 3–7 iterations of hypothesis → evidence → verify → score</td><td>Can interrupt or redirect at any time</td><td>hypothesis_update per iteration</td></tr>
<tr><td>Report Generation</td><td>Generates all sections in parallel with citations</td><td>Waits 60–90 seconds</td><td>report_generation event</td></tr>
<tr><td>Human Review</td><td>Presents finding + citation map per section</td><td>Reads, edits, approves each section</td><td>approval event per section with diff_hash</td></tr>
<tr><td>Case Close</td><td>Locks report, stores hash, appends Citation Map</td><td>Signs off final report</td><td>case_close event with report_hash</td></tr>
</table></div>
</div>
<!-- ═══════ FLOW: INGESTION ═══════ -->
<div class="section" id="s-flow-ingest">
<h1 class="page-title">Workflow: <span>Document Ingestion</span></h1>
<p class="page-sub">How a document goes from upload to queryable evidence in the case namespace.</p>
<div class="dw"><div class="dt">Cluster A — Document Ingestion Pipeline</div>
<img src="diagrams/diagram-3.svg" alt="Cluster A document ingestion pipeline diagram, from file upload through parsing, chunking, and embedding to the case namespace" style="width:100%;max-width:100%;display:block;"></div>
<div class="callout warn"><div class="cl">Most Common Ingestion Failures</div>
<p><strong>Scanned PDFs without OCR layer:</strong> If PyMuPDF returns no text, auto-route to Tesseract. <strong>Password-protected files:</strong> Require investigator to decrypt before upload. <strong>Non-standard CSV encodings:</strong> Detect encoding with chardet before parsing.</p></div>
</div>
<!-- ═══════ FLOW: ENTITY ═══════ -->
<div class="section" id="s-flow-entity">
<h1 class="page-title">Workflow: <span>Entity Resolution</span></h1>
<p class="page-sub">How the system determines "Rajesh Kumar", "R. Kumar", and "RJSH_KMR" are the same person — and what happens when it is not sure.</p>
<div class="dw"><div class="dt">Three-Tier Entity Resolution</div>
<img src="diagrams/diagram-4.svg" alt="Three-tier entity resolution workflow diagram showing automatic merges and human-reviewed Tier 2 and Tier 3 decisions" style="width:100%;max-width:100%;display:block;"></div>
<div class="callout danger"><div class="cl">Critical: Do Not Skip Entity Resolution</div>
<p>A 10% entity duplication rate in a 500-node graph produces 50 phantom nodes — enough to break circular flow detection entirely. Build Tier 2/3 resolution before building the graph layer.</p></div>
</div>
<!-- ═══════ FLOW: REASONING ═══════ -->
<div class="section" id="s-flow-reason">
<h1 class="page-title">Workflow: <span>Recursive Reasoning Loop</span></h1>
<p class="page-sub">How the AI forms a hypothesis, tests it against evidence, and iterates until confident or flagged. This is Cluster D — the reasoning engine.</p>
<div class="dw"><div class="dt">Cluster D — Recursive Reasoning Loop</div>
<img src="diagrams/diagram-5.svg" alt="Cluster D recursive reasoning loop diagram: hypothesis, evidence retrieval, verification, critique, and scoring iterations" style="width:100%;max-width:100%;display:block;"></div>
<h2 class="sh">Evidence Score Thresholds</h2>
<div class="cards">
<div class="card"><h4>0.80 — Section Lock</h4><p>Individual report sections lock at this score. Still requires minimum 3 citations.</p></div>
<div class="card gold"><h4>0.90 — Final Finding Lock</h4><p>Final finding sections require this higher threshold.</p></div>
<div class="card red"><h4>High Score + Thin Evidence</h4><p>Score high but fewer than 3 citations? System flags: "High confidence on thin evidence — manual review required." Never auto-lock.</p></div>
</div>
</div>
<!-- ═══════ FLOW: PRIVACY ═══════ -->
<div class="section" id="s-flow-privacy">
<h1 class="page-title">Workflow: <span>Privacy Gateway</span></h1>
<p class="page-sub">How PII is removed before data reaches any external AI, and restored only after investigator approval — inside your environment.</p>
<div class="dw"><div class="dt">Cluster E — Pseudonymisation Pipeline</div>
<img src="diagrams/diagram-6.svg" alt="Cluster E pseudonymisation pipeline diagram: PII detection, tokenisation before external AI calls, and post-approval de-tokenisation" style="width:100%;max-width:100%;display:block;"></div>
<div class="callout warn"><div class="cl">Known Limitation — Inference Re-Identification</div>
<p>Pseudonymisation controls what enters the AI — not what it infers. A model given pseudonymised data may produce output that, combined with other information, re-identifies a data subject. The Human Checkpoint is the primary control. Get legal counsel to review your approach before going live.</p></div>
</div>
<!-- ═══════ FLOW: AUDIT ═══════ -->
<div class="section" id="s-flow-audit">
<h1 class="page-title">Workflow: <span>Audit Chain</span></h1>
<p class="page-sub">How every action is permanently recorded and cryptographically linked — making tampering mathematically detectable.</p>
<div class="dw"><div class="dt">Hash-Chain Immutability Design</div>
<img src="diagrams/diagram-7.svg" alt="Hash-chain immutability design diagram showing each audit log entry cryptographically linked to the previous entry's hash" style="width:100%;max-width:100%;display:block;"></div>
<h2 class="sh">What Gets Logged</h2>
<div class="tw"><table>
<tr><th scope="col">Event</th><th scope="col">Key Fields</th><th scope="col">Why It Matters</th></tr>
<tr><td>File ingestion</td><td>file_name, file_hash, chunk_count, namespace_id</td><td>Proves exactly what evidence was in the case at what time</td></tr>
<tr><td>RAG retrieval</td><td>query_text, chunk_ids returned, timestamp</td><td>Shows what the AI was told, not just what it said</td></tr>
<tr><td>AI model call</td><td>model_id, model_version, temperature, input_hash, output_hash</td><td>Reproducibility — can re-run same call months later</td></tr>
<tr><td>Retrieval verification</td><td>claims verified, claims stripped, unverified_register_hash</td><td>Documents the anti-hallucination layer ran</td></tr>
<tr><td>Human approval</td><td>investigator_id, section, diff_hash, timestamp</td><td>The accountability record — who approved what</td></tr>
<tr><td>Final report generation</td><td>report_hash, citation_map_hash, model_versions_used</td><td>Tamper-evident seal on the final output</td></tr>
</table></div>
</div>
<!-- ═══════ CLUSTERS A-G ═══════ -->
<div class="section" id="s-ca">
<h1 class="page-title">Cluster A — <span>Documentary Evidence</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>Turns any document pile into queryable evidence. Upload a contract, spreadsheet, email chain, or scanned report — every sentence becomes retrievable by meaning, not keyword. An investigator asking "what did the subject know and when?" gets an answer in seconds, with source and page number attached.</p></div>
<div class="cards">
<div class="card"><h4>A1 Ingestion</h4><p>Parses all file types: PDF, Excel, CSV, email, Word, audio transcripts.</p></div>
<div class="card"><h4>A2 Chunking</h4><p>Splits documents using document-type-aware strategies.</p></div>
<div class="card"><h4>A3 Embedding</h4><p>Converts chunks to vectors using BGE-M3 locally. No data leaves environment.</p></div>
<div class="card"><h4>A4 Extraction</h4><p>Pulls entities, dates, amounts, relationships from each chunk.</p></div>
<div class="card"><h4>A5 Templates</h4><p>Pre-built RAG queries optimised per report section and evidence type.</p></div>
<div class="card"><h4>A6 Parallel Gen</h4><p>All report sections generated simultaneously. 60–90 seconds total.</p></div>
<div class="card navy"><h4>A7 Citation Map</h4><p>Auto-generated index mapping every claim to its source document, chunk, and page.</p></div>
</div>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>PDF parsing</td><td>PyMuPDF (fitz)</td><td><span class="cp free">Free</span></td></tr>
<tr><td>OCR for scanned PDFs</td><td>Tesseract</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Excel/CSV parsing</td><td>pandas + openpyxl</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Email parsing</td><td>extract-msg + mailparser</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Embedding model</td><td>BGE-M3 via Ollama</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Vector store</td><td>Qdrant self-hosted</td><td><span class="cp free">Free</span></td></tr>
</table></div>
</div>
<div class="section" id="s-cb">
<h1 class="page-title">Cluster B — <span>Behavioural & Interview</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>Detects when someone says something in an interview that contradicts the evidence. The system checks every document and transaction record in the case and flags contradictions automatically — citing both the statement and the contradicting evidence.</p></div>
<div class="cards">
<div class="card"><h4>B1 Transcript Chunking</h4><p>Splits by speaker turn. Each chunk tagged with speaker ID, timestamp, session ID.</p></div>
<div class="card"><h4>B2 Contradiction Detector</h4><p>Cross-references every transcript claim against document corpus and graph. Cites both sides.</p></div>
<div class="card navy"><h4>B3 Transcript Approval Gate</h4><p>Separate approval flag for findings from interview statements — beyond standard section approval.</p></div>
</div>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>Speech-to-text (local)</td><td>Whisper via Faster-Whisper</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Speaker diarisation</td><td>pyannote-audio</td><td><span class="cp free">Free (HuggingFace token required)</span></td></tr>
<tr><td>PII scrubbing</td><td>Presidio with custom recognisers</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Contradiction detection</td><td>LangChain + Qdrant retrieval</td><td><span class="cp free">Free</span></td></tr>
</table></div>
</div>
<div class="section" id="s-cc">
<h1 class="page-title">Cluster C — <span>Entity & Network Intelligence</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>Answers the question no document search can: who is connected to whom? Six independently filed insurance claims look unrelated until a graph shows four share two witnesses.</p></div>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>Graph database</td><td>Neo4j Community self-hosted</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Graph queries</td><td>Neo4j Python driver + Cypher</td><td><span class="cp free">Free</span></td></tr>
<tr><td>NER for entity extraction</td><td>spaCy + custom models</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Fuzzy matching (Tier 2)</td><td>rapidfuzz (Jaro-Winkler)</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Graph algorithms</td><td>Neo4j Graph Data Science plugin</td><td><span class="cp free">Free (community)</span></td></tr>
</table></div>
</div>
<div class="section" id="s-cd">
<h1 class="page-title">Cluster D — <span>Advanced Reasoning</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>The reasoning engine. Forms a hypothesis, searches for evidence that tests it, challenges its own reasoning, revises, repeats. A 340-page document set that takes 3 days manually produces a draft finding in 90 seconds.</p></div>
<h2 class="sh">The Adversarial Critic — Active Hypothesis Invalidation</h2>
<div class="callout danger"><div class="cl">The Problem This Solves</div>
<p>A reasoning loop that only refines toward a conclusion — without genuinely testing alternative explanations — is not an investigation tool. It is a <strong>confirmation bias engine</strong>. The Adversarial Critic is the architectural control that prevents this.</p></div>
<p>For every iteration where the evidence score exceeds 0.70, the Adversarial Critic fires an explicit counter-query: <em>what evidence, if present in the corpus, would contradict or explain away the current hypothesis?</em> It retrieves that counter-evidence and forces the reasoning loop to account for it before proceeding.</p>
<div class="tw"><table>
<tr><th scope="col">Query Type</th><th scope="col">What It Asks</th><th scope="col">What It Surfaces</th></tr>
<tr><td>Alternative explanation</td><td>What legitimate business reason exists for the observed pattern?</td><td>Legitimate invoices, documented approval chains, regulatory filings</td></tr>
<tr><td>Counter-entity</td><td>Does any evidence suggest a different entity — not the subject — is responsible?</td><td>Third-party records, alternate beneficiary documents, entity resolution edge cases</td></tr>
<tr><td>Timeline invalidation</td><td>Is there evidence contradicting the proposed chronology?</td><td>Timestamp mismatches, document dating anomalies, contradicting records</td></tr>
<tr><td>Typology mismatch</td><td>Does the pattern match any known non-suspicious pattern from your institutional corpus?</td><td>Cluster F holdout patterns, industry-standard profiles, regulatory safe harbour descriptions</td></tr>
</table></div>
<p>A hypothesis that cannot withstand the Adversarial Critic is flagged as <strong>"contested hypothesis — alternative explanation not ruled out"</strong> and escalated with both supporting and counter-evidence presented side by side for the investigator to adjudicate.</p>
<h3 class="sth">How to Build It</h3>
<ul>
<li>Fire after every reasoning iteration where evidence score exceeds 0.70</li>
<li>Use same retrieval mechanism as main loop — RAG query against case namespace</li>
<li>Pass adversarial query results to Critic alongside supporting evidence</li>
<li>Block evidence score from exceeding 0.85 until adversarial check passes or investigator explicitly overrides</li>
<li>Prompt approach: <em>"Given the current hypothesis [X], retrieve all evidence in the case corpus that could support an alternative explanation. Focus on: legitimate business context, third-party responsibility, timeline contradictions."</em></li>
</ul>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>Reasoning loop</td><td>LangGraph</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Local 8B model (orchestration)</td><td>Llama 3.1 8B via Ollama</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Local 70B model (analysis)</td><td>Llama 3.1 70B via Ollama</td><td><span class="cp free">Free</span> + GPU compute</td></tr>
<tr><td>Schema validation</td><td>Pydantic</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Hypothesis state persistence</td><td>Redis or PostgreSQL</td><td><span class="cp free">Free</span></td></tr>
</table></div>
</div>
<div class="section" id="s-ce">
<h1 class="page-title">Cluster E — <span>Privacy Gateway</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>Replaces every personal identifier with an anonymous token before anything leaves your systems. The AI reasons about tokens, not people. Real identifiers restored only after investigator approval, only inside your environment.</p></div>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>PII detection</td><td>Microsoft Presidio + custom recognisers</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Token map store</td><td>Redis (in-memory, encrypted)</td><td><span class="cp free">Free</span></td></tr>
<tr><td>External AI API</td><td>Anthropic Claude / OpenAI GPT-4</td><td><span class="cp low">$3–15 per M tokens</span></td></tr>
</table></div>
</div>
<div class="section" id="s-cf">
<h1 class="page-title">Cluster F — <span>Institutional Memory</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>After 200+ cases, train the AI on your own investigation history. It learns your fraud typologies, your reporting standards, what you flag and what you clear.</p></div>
<div class="callout warn"><div class="cl">Minimum Threshold</div><p>Do not fine-tune before 200 completed cases. A model trained on 80 cases overfits. The general model outperforms an under-trained specific one.</p></div>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>Fine-tuning compute</td><td>AWS/GCP GPU instances (A100)</td><td><span class="cp med">$3–8/hr (₹252–672/hr)</span></td></tr>
<tr><td>Training framework</td><td>Axolotl or LLaMA-Factory</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Holdout evaluation</td><td>Custom evaluation harness</td><td><span class="cp free">Free</span></td></tr>
</table></div>
</div>
<div class="section" id="s-cg">
<h1 class="page-title">Cluster G — <span>Integrity Elevation</span></h1>
<div class="callout"><div class="cl">In Plain English</div><p>Daily external anchoring of your audit chain. A third party can mathematically verify every entry — without trusting your systems. "We say it wasn't altered" becomes "it's mathematically provable."</p></div>
<h2 class="sh">Tools Required</h2>
<div class="tw"><table>
<tr><th scope="col">Component</th><th scope="col">Tool</th><th scope="col">Cost</th></tr>
<tr><td>External ledger anchor (G1)</td><td>AWS QLDB or Azure Confidential Ledger</td><td><span class="cp low">~$5–20/month</span></td></tr>
<tr><td>Recording → transcript hash link (G2)</td><td>Custom Python — SHA-256 + PostgreSQL</td><td><span class="cp free">Free</span></td></tr>
<tr><td>Chain verification tool</td><td>Custom Python verification script</td><td><span class="cp free">Free</span></td></tr>
</table></div>
</div>
<!-- ═══════ AI GUIDE ═══════ -->
<div class="section" id="s-ai">
<h1 class="page-title">Building <span>With AI Tools</span></h1>
<p class="page-sub">Which AI tools to use for which parts of the build — and how to prompt them effectively.</p>
<div class="callout ok"><div class="cl">The Core Principle</div>
<p>You provide the architecture and domain expertise. AI tools write the code. You design each component's inputs, outputs, and constraints in plain English — AI coding tools implement it. Never ask AI to design the architecture from scratch.</p></div>
<h2 class="sh">Which AI Tool for Which Task</h2>
<div class="tw"><table>
<tr><th scope="col">Task</th><th scope="col">Best Tool</th><th scope="col">Sample Prompt Approach</th></tr>
<tr><td>Write pipeline code (ingestion, chunking, embedding)</td><td><strong>Claude Code or Cursor</strong></td><td>"Build a document ingestion pipeline: takes PDF, extracts with PyMuPDF, chunks by paragraph with 100-char overlap, stores in Qdrant namespace {case_id} with metadata {source, page_ref, chunk_id}"</td></tr>
<tr><td>Design database schemas</td><td><strong>Claude (this interface)</strong></td><td>"Design a PostgreSQL schema for an append-only audit log with SHA-256 hash chaining where each row links to the previous row's hash."</td></tr>
<tr><td>Write Cypher queries for Neo4j</td><td><strong>Claude or ChatGPT</strong></td><td>"Write a Cypher query that detects circular fund flows where the same amount passes through 3+ entities and returns to origin within 30 days."</td></tr>
<tr><td>Build the HITL review UI</td><td><strong>Claude Code or v0.dev</strong></td><td>"Build a section review interface: finding text on left, citation map on right, edit button, approve button. Approve writes to audit log."</td></tr>
<tr><td>Write investigation query templates</td><td><strong>Claude (this interface)</strong></td><td>"Write a RAG query for the Red Flags section of a financial crime investigation. Should surface: structuring, layering, velocity spikes, round-tripping."</td></tr>
<tr><td>Debug errors</td><td><strong>Claude Code or Cursor</strong></td><td>Paste the full error + relevant code. "This is failing at the Qdrant write step. Here is the error and the ingestion code."</td></tr>
</table></div>
<h2 class="sh">AI Tool Subscriptions You Need</h2>
<div class="tw"><table>
<tr><th scope="col">Tool</th><th scope="col">What It Is</th><th scope="col">Cost</th><th scope="col">INR</th><th scope="col">Priority</th></tr>
<tr><td><strong>Claude Pro</strong></td><td>Architecture design, query templates, documentation, review.</td><td>$20/month</td><td>₹1,680/month</td><td>Essential</td></tr>
<tr><td><strong>Claude Code</strong></td><td>Terminal-based agentic coding. Reads your repo, writes and runs code, fixes errors.</td><td>~$30–100/month</td><td>₹2,520–₹8,400/month</td><td>Essential for build phase</td></tr>
<tr><td><strong>Cursor</strong></td><td>IDE with AI coding built in. Alternative to Claude Code.</td><td>$20/month</td><td>₹1,680/month</td><td>Choose: Cursor OR Claude Code</td></tr>
<tr><td><strong>LangSmith</strong></td><td>Traces every LLM call — essential for debugging the reasoning loop.</td><td>Free / $39/month</td><td>₹3,276/month</td><td>Add in Phase 4</td></tr>
</table></div>
<h2 class="sh">Prompting Principles</h2>
<div class="steps">
<div class="step"><div class="sn">1</div><div class="sb"><h4>Specify Inputs and Outputs Exactly</h4><p>Don't say "build a chunker." Say "Build a chunker that takes a string and returns a list of dicts with keys: chunk_id (UUID), text (string), page_ref (int), source_doc (string), char_count (int). Use 500-char chunks with 100-char overlap."</p></div></div>
<div class="step"><div class="sn">2</div><div class="sb"><h4>Include the Non-Negotiable Constraint</h4><p>Always add the architectural constraint: "The namespace_id must match the case_id — never query across namespaces" or "Temperature must be 0 — no randomness in extraction tasks."</p></div></div>
<div class="step"><div class="sn">3</div><div class="sb"><h4>Ask for Tests Alongside Code</h4><p>Add "also write 3 pytest test cases" to every code request. Test cases clarify what the code should do and catch errors before production.</p></div></div>
<div class="step"><div class="sn">4</div><div class="sb"><h4>Review Before Running</h4><p>Read the generated code before running in production. You don't need to understand every line — but you should understand what it does. Ask Claude to explain anything unclear.</p></div></div>
</div>
</div>
<!-- ═══════ PROFILES ═══════ -->
<div class="section" id="s-profiles">
<h1 class="page-title">Deployment <span>Profiles</span></h1>
<p class="page-sub">Six reference configurations — from minimal compliant to full enterprise forensic. Start where your needs are, grow from there.</p>
<div class="tw"><table>
<tr><th>Profile</th><th>Components</th><th>Use Case</th><th>Monthly Cost</th><th>Build Time</th></tr>
<tr><td><strong>1 — Minimal Compliant</strong></td><td>8</td><td>Simple HR, low-volume internal investigations</td><td>$20–50 / ₹1,680–4,200</td><td>4–6 weeks</td></tr>
<tr><td><strong>2 — Interview-Led</strong></td><td>11</td><td>HR investigations, whistleblower, testimony-primary cases</td><td>$30–80 / ₹2,520–6,720</td><td>6–8 weeks</td></tr>
<tr><td><strong>3 — Document Investigation</strong></td><td>20</td><td>Legal due diligence, procurement review, regulatory exam prep</td><td>$80–200 / ₹6,720–16,800</td><td>10–14 weeks</td></tr>
<tr><td><strong>4 — Corporate Fraud</strong></td><td>28</td><td>FCPA/ABAC, bribery, beneficial ownership, multi-party fraud</td><td>$200–500 / ₹16,800–42,000</td><td>16–24 weeks</td></tr>
<tr><td><strong>5 — Financial Crime Full</strong></td><td>31</td><td>AML, STR generation, transaction monitoring follow-up</td><td>$600–2,000 / ₹50,400–1,68,000</td><td>24–32 weeks</td></tr>
<tr><td><strong>6 — Enterprise Forensic</strong></td><td>35</td><td>Enforcement agency, Big 4 forensic, court-bound outputs</td><td>$2,000–6,000 / ₹1,68,000–5,04,000</td><td>32–40 weeks</td></tr>
</table></div>
<div class="callout dark"><div class="cl">Which Profile Should You Start With?</div>
<p>Documents primary → Profile 3. Interviews primary → Profile 2. Entity network mapping needed → Profile 4. Regulated financial institution with external AI → Profile 5. Start at the lowest profile covering your current needs — upgrade incrementally without rebuilding.</p></div>
</div>
<!-- ═══════ FAQ ═══════ -->
<div class="section" id="s-faq">
<h1 class="page-title">Frequently Asked <span>Questions</span></h1>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>Do I need to know how to code to build this?</span><span class="arrow">▶</span></div>
<div class="ab"><p>You need at least one person who can write Python and work with APIs. With Claude Code or Cursor, a single developer can build Profile 3 in 8–12 weeks. You as the founder provide the domain knowledge — what the system needs to do, what constitutes a finding. AI tools handle most of the actual code writing.</p></div></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>Can I use cloud services instead of self-hosting everything?</span><span class="arrow">▶</span></div>
<div class="ab"><p>Yes, with a critical caveat: investigation data is sensitive. The vector database, graph database, audit log, and PII token map must stay on-premise or in a jurisdiction-compliant private cloud. Only pseudonymised data may reach external AI APIs (Cluster E). For Profiles 1–4, everything can run on a private cloud VM without any external AI.</p></div></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>How long before the system delivers value?</span><span class="arrow">▶</span></div>
<div class="ab"><p>Phase 1 (Mandatory Base) delivers defensible audit-logged case management in 4–6 weeks — before any AI reasoning is active. Phase 2 delivers evidence search in under 1 second by week 14. Phase 4 delivers draft findings in 60–90 seconds. You do not need the full platform to start getting value.</p></div></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>What is the minimum hardware I need?</span><span class="arrow">▶</span></div>
<div class="ab"><p>Profiles 1–3 (no 70B model): cloud VM with 8GB RAM and 4 vCPUs (~$30–50/month on AWS/GCP). Profile 4+ (70B model): GPU instance with at least 40GB GPU RAM. AWS g5.12xlarge or GCP a2-highgpu-1g. Budget $150–400/month for compute. Alternatively, a workstation with two RTX 4090s (24GB VRAM each) can run a 70B model split across the two cards.</p></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>Is the system admissible in court or regulatory proceedings?</span><span class="arrow">▶</span></div>
<div class="ab"><p>The architecture is designed for maximum defensibility — tamper-evident audit chain, human approval at every finding, evidence citations on every claim. With Cluster G active, integrity is independently verifiable. However, admissibility is a legal question that depends on jurisdiction and proceeding type. Engage qualified legal counsel before relying on IRE outputs in any formal proceeding.</p></div></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>How do I handle Indian regulatory requirements (RBI, PMLA, SEBI)?</span><span class="arrow">▶</span></div>
<div class="ab"><p>The architecture is jurisdiction-agnostic — map your specific regulatory framework into the investigation methodology document. For RBI/PMLA: your STR templates, threshold amounts, and filing windows become the query template library (A5) and framework coverage checklist (D4). For data residency: all data stores must be on Indian infrastructure — use AWS Mumbai, Azure India, or domestic providers (Jio Cloud, Tata TCS). Engage a PMLA-specialist counsel to sign off before production.</p></div></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>What is the difference between institutional-grade and independently verifiable audit integrity?</span><span class="arrow">▶</span></div>
<div class="ab"><p>Institutional-grade (Base B5+B6): the audit chain is tamper-evident — any modification breaks the cryptographic links, detectable by anyone with access to your systems. Court-grade (+ Cluster G): the daily chain hash is anchored to an independent external ledger. A third party can verify integrity without accessing your systems. "We say it wasn't altered" vs "a third party can prove it wasn't altered."</p></div></div>
<div class="acc"><div class="ah" onclick="toggleAcc(this)"><span>When should I NOT build this?</span><span class="arrow">▶</span></div>
<div class="ab"><p>Don't build this if: fewer than 5 investigations per month (overhead exceeds value at that scale); investigation methodology is not documented (nothing to encode); you cannot allocate one developer for 3+ months. Start with a simpler case management system and return to IRE when volume and complexity justify it.</p></div></div>
</div>
<!-- ═══════ INVESTIGATOR UX ═══════ -->
<div class="section" id="s-ux">
<h1 class="page-title">Investigator <span>UX — Decision Screens</span></h1>
<p class="page-sub">The backend is only half the system. This is where product success actually happens — the investigator-facing interfaces that make the architecture usable in practice.</p>
<div class="callout danger"><div class="cl">The Missing Piece</div>
<p>Both the whitepaper and most technical architectures are backend-heavy. But investigators interact with screens, not APIs. If the decision screens are confusing, slow, or disconnected from the evidence, investigators will bypass them — defeating the human gate entirely.</p></div>
<h2 class="sh">The Five Critical Screens</h2>
<h3 class="sth">Screen 1 — Entity Resolution Decision Screen</h3>
<p>Appears when Tier 2 or Tier 3 entity match is detected. The most important HITL screen in the system — graph quality lives or dies here.</p>
<div class="tw"><table>
<tr><th>Left Panel</th><th>Right Panel</th><th>Bottom Bar</th></tr>
<tr><td>Entity A — all known attributes, source documents, frequency of appearance</td><td>Entity B — all known attributes, source documents, frequency of appearance</td><td>Three buttons: MERGE (green) / KEEP SEPARATE (gray) / LINK AS RELATED PARTY (yellow). Confidence score shown prominently. Timer visible.</td></tr>
</table></div>
<p><strong>Design principle:</strong> Never show just names. Show the evidence that led to the match proposal — field comparison table, similarity scores, co-occurrence in documents. The investigator is making an evidentiary decision, not a clerical one.</p>
<h3 class="sth">Screen 2 — Finding Review Screen (Section-by-Section)</h3>
<p>The primary review interface. Appears after report generation for each section.</p>
<div class="tw"><table>
<tr><th>Left Panel — Finding</th><th>Right Panel — Citation Map</th></tr>
<tr><td>Full finding text with inline citation tags highlighted. Edit button. Word count. Iteration count (how many reasoning cycles produced this).</td><td>Every citation expanded: document name, page number, chunk text preview. Click to open source document at exact location.</td></tr>
</table></div>
<p><strong>Design principle:</strong> The investigator should never have to leave this screen to verify a citation. Everything needed to approve or reject is visible in one view.</p>
<h3 class="sth">Screen 3 — Contradiction Alert Screen</h3>
<p>Appears when Cluster B detects a statement-evidence contradiction. Requires separate approval before entering the record.</p>
<div class="tw"><table>
<tr><th>Top Half — Statement</th><th>Bottom Half — Contradicting Evidence</th></tr>
<tr><td>Exact transcript quote with speaker ID, timestamp, session number, and interview context.</td><td>The contradicting document chunk, highlighted to the exact contradicting sentence, with source and page.</td></tr>
</table></div>
<p><strong>Design principle:</strong> Both sides presented simultaneously. Investigator must explicitly assess the contradiction — not just note that it exists. Approval options: MATERIAL CONTRADICTION / MINOR DISCREPANCY / NOT A CONTRADICTION (with rationale required for the last option).</p>
<h3 class="sth">Screen 4 — Graph + Evidence Unified View</h3>
<p>The most complex screen. Shows entity network alongside the documents that support it.</p>
<ul>
<li><strong>Graph panel (left 60%):</strong> Interactive entity network. Click any node → documents referencing that entity appear in right panel. Click any edge → transactions supporting that relationship appear.</li>
<li><strong>Document panel (right 40%):</strong> Scrollable evidence feed filtered to selected node/edge. Each chunk shows source, page, date, and citation count.</li>
<li><strong>Timeline bar (bottom):</strong> Drag to filter both graph and documents to a date range.</li>
</ul>
<p><strong>Design principle:</strong> Graph and documents are not separate tabs — they are linked views. Selecting a graph element immediately updates the document panel. This is the screen that makes network intelligence accessible to non-technical investigators.</p>
<h3 class="sth">Screen 5 — Adversarial Review Screen</h3>
<p>Appears when the Adversarial Critic flags a contested hypothesis.</p>
<div class="tw"><table>
<tr><th>Left Column — Supporting Evidence</th><th>Right Column — Counter-Evidence</th></tr>
<tr><td>Evidence supporting current hypothesis with citations and evidence score.</td><td>Evidence supporting alternative explanation with citations and source.</td></tr>
</table></div>
<p>Investigator must explicitly choose: HYPOTHESIS STANDS / REVISE HYPOTHESIS / ESCALATE FOR SENIOR REVIEW. Cannot skip this screen when adversarial flag is raised.</p>
<h2 class="sh">Build Approach for Non-Technical Founders</h2>
<div class="steps">
<div class="step"><div class="sn">1</div><div class="sb"><h4>Wireframe First with Claude</h4>
<p>Describe each screen to Claude: "Build an HTML mockup of a finding review screen with: left panel showing finding text with highlighted citations, right panel showing citation map with source preview, approve/edit/reject buttons at the bottom." Claude will generate a working HTML prototype in one pass.</p></div></div>
<div class="step"><div class="sn">2</div><div class="sb"><h4>Test With a Real Investigator Before Coding the Backend</h4>
<p>Get an HTML mockup in front of an actual investigator (even yourself) before connecting it to the backend. The most expensive UI bugs are architectural — discovered after the backend is built.</p></div></div>
<div class="step"><div class="sn">3</div><div class="sb"><h4>Graph Visualisation: Use an Existing Library</h4>
<p>Do not build a graph renderer from scratch. Use: <strong>vis.js Network</strong> (free, straightforward) or <strong>Cytoscape.js</strong> (more powerful, still free). Connect to your Neo4j query layer via a simple REST API. The library handles the rendering.</p></div></div>
<div class="step"><div class="sn">4</div><div class="sb"><h4>Every Approval Writes to Audit Chain</h4>
<p>The UI is the interface to the audit chain. Every button click that constitutes an investigator decision must write an immutable log entry. This is not optional — build it into the frontend as a requirement, not an afterthought.</p></div></div>
</div>
<h2 class="sh">Graph Visualisation Libraries</h2>
<div class="tw"><table>
<tr><th>Library</th><th>Best For</th><th>Cost</th><th>Neo4j Integration</th></tr>
<tr><td><strong>vis.js Network</strong></td><td>Simple to medium complexity graphs. Quick to set up. Good for most investigation use cases.</td><td>Free</td><td>Fetch Cypher results as JSON, feed directly to vis.js dataset</td></tr>
<tr><td>Cytoscape.js</td><td>Complex graphs with custom layouts and styling. Better performance at scale.</td><td>Free</td><td>Same as vis.js — JSON data format</td></tr>
<tr><td>Neo4j Bloom</td><td>Built-in Neo4j graph explorer. Zero frontend code. Good for internal teams.</td><td>Free (Community) / $65+/mo (Cloud)</td><td>Native — no integration needed</td></tr>
<tr><td>D3.js Force Graph</td><td>Maximum customisation. Steep learning curve.</td><td>Free</td><td>Manual data transformation required</td></tr>
</table></div>
</div>
</div></div><!-- /.content /.main -->
<script>
/**
 * Activate the content section whose element id is 's-' + id and
 * synchronise the sidebar highlight to match.
 *
 * @param {string} id - Section key, e.g. 'profiles' targets '#s-profiles'.
 *                      Sidebar links are matched by searching their inline
 *                      onclick attribute for the quoted id.
 */
function show(id) {
  // Deactivate every section, then activate the requested one (if present).
  document.querySelectorAll('.section').forEach(function (s) {
    s.classList.remove('active');
  });
  var el = document.getElementById('s-' + id);
  if (el) {
    el.classList.add('active');
    window.scrollTo(0, 0);
  }
  // Single pass over sidebar links: clear the old highlight and, in the
  // same iteration, re-highlight the link whose onclick references this id.
  // (Previously this was two separate querySelectorAll loops.)
  document.querySelectorAll('.sb-link').forEach(function (a) {
    a.classList.remove('active');
    var oc = a.getAttribute('onclick') || '';
    if (oc.indexOf("'" + id + "'") !== -1) {
      a.classList.add('active');
    }
  });
}
/**
 * Accordion behaviour: at most one panel open at any time.
 * Clicking an open header closes it; clicking a closed header closes
 * every other panel and opens the clicked one.
 *
 * @param {Element} header - The clicked .ah header; its panel is the
 *                           immediately following sibling (.ab).
 */
function toggleAcc(header) {
  var panel = header.nextElementSibling;
  var wasOpen = panel.classList.contains('open');
  // Collapse every currently-open panel and reset its arrow indicator.
  document.querySelectorAll('.ab.open').forEach(function (openPanel) {
    openPanel.classList.remove('open');
    var openArrow = openPanel.previousElementSibling.querySelector('.arrow');
    if (openArrow) {
      openArrow.classList.remove('open');
    }
  });
  // If the clicked panel was closed, open it now; if it was already open,
  // the loop above has closed it and we leave everything collapsed.
  if (!wasOpen) {
    panel.classList.add('open');
    var arrow = header.querySelector('.arrow');
    if (arrow) {
      arrow.classList.add('open');
    }
  }
}
</script>
</body>
</html>