-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path__init__.py
More file actions
67 lines (56 loc) · 1.89 KB
/
__init__.py
File metadata and controls
67 lines (56 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
Embedding Space Invaders - Multi-Layer Detection System for LLM Prompt Injection

A multi-layer embedding analysis system that attempts to detect prompt injection
attacks by measuring geometric distances in transformer embedding space.

Status: Doesn't really work (false positive rates of 96.9% and 100%), but well-documented!

Key Features:
- Multi-layer transformer analysis (5 layers)
- Statistical anomaly detection (Mahalanobis, cosine similarity, PCA)
- Detection-first approach with optional mitigation
- Comprehensive logging and evaluation
- Spectacular failure metrics
Main Components:
1. EmbeddingSpaceInvaders: Multi-layer detection pipeline
2. ESInvadersHarness: Experiment runner and evaluator

Usage:
    # Basic detection
    from embedding_space_invaders import EmbeddingSpaceInvaders

    detector = EmbeddingSpaceInvaders(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        detection_mode="voting",
        min_anomalous_layers=2
    )
    detector.train_baseline(safe_prompts)
    scores = detector.score_prompt("User input here")

    # Run experiments
    from embedding_space_invaders import ESInvadersHarness

    harness = ESInvadersHarness(detector)
    results = harness.run_detection_experiment(test_prompts, labels)
"""
from .embedding_space_invaders import (
EmbeddingSpaceInvaders,
AggregatedScores,
LayerScores,
ThresholdConfig
)
from .es_invaders_harness import ESInvadersHarness
from .metrics import (
compute_mahalanobis_distance,
compute_cosine_similarity,
compute_pca_projections,
compute_baseline_statistics,
compute_attack_centroid
)

# Package version string.
__version__ = "1.0.0"

# Public API of the package: the names exported by a star-import.
# Every entry corresponds to a name imported at the top of this module.
__all__ = [
    # imported from .embedding_space_invaders
    "EmbeddingSpaceInvaders",
    # imported from .es_invaders_harness
    "ESInvadersHarness",
    # imported from .embedding_space_invaders
    "AggregatedScores",
    "LayerScores",
    "ThresholdConfig",
    # imported from .metrics
    "compute_mahalanobis_distance",
    "compute_cosine_similarity",
    "compute_pca_projections",
    "compute_baseline_statistics",
    "compute_attack_centroid",
]