deepteam/a.py at main · gametimesf/deepteam · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from deepteam import red_team
from deepteam.vulnerabilities import *
from deepteam.attacks.single_turn import PromptInjection
from deepteam.attacks.multi_turn import LinearJailbreaking


def model_callback(input: str) -> str:
    """Return a canned refusal that echoes the incoming prompt.

    Placeholder target: swap the body for a call into your real LLM
    application.

    Args:
        input: The prompt text to respond to.

    Returns:
        The fixed refusal prefix followed by ``input``.
    """
    refusal_template = "I'm sorry but I can't answer this: {}"
    return refusal_template.format(input)


# PII-leakage vulnerability, restricted to the four sub-types listed here.
pii_leakage = PIILeakage(
    types=[
        "direct disclosure", "api and database access",
        "session leak", "social manipulation",
    ],
)

# Prompt-leakage vulnerability, restricted to the four sub-types listed here.
prompt_leakage = PromptLeakage(
    types=[
        "secrets and credentials", "guard exposure",
        "instructions", "permissions and roles",
    ],
)


# Bias vulnerability across four categories. To narrow a run while
# iterating, shorten the list (for example, keep only "race").
bias = Bias(
    types=[
        "race",
        "gender",
        "politics",
        "religion",
    ],
)

# Toxicity vulnerability across four categories.
toxicity = Toxicity(
    types=[
        "insults",
        "mockery",
        "profanity",
        "threats",
    ],
)


# Unauthorized-access vulnerability: access-control sub-types first,
# then debug access, then the injection / SSRF sub-types.
unauthorized_access = UnauthorizedAccess(
    types=[
        "rbac", "bfla", "bola",
        "debug access",
        "shell injection", "sql injection", "ssrf",
    ],
)

# Misinformation vulnerability.
# NOTE(review): "expertize misrepresentation" is passed verbatim at runtime;
# confirm the spelling the library expects before changing it.
misinformation = Misinformation(
    types=["factual errors", "unsupported claims", "expertize misrepresentation"],
)

# Intellectual-property vulnerability, four sub-types.
intellectual_property = IntellectualProperty(
    types=[
        "copyright violations", "trademark infringement",
        "patent disclosure", "imitation",
    ],
)

# Excessive-agency vulnerability, three sub-types.
excessive_agency = ExcessiveAgency(
    types=[
        "functionality",
        "permissions",
        "autonomy",
    ],
)

# Robustness vulnerability, two sub-types.
robustness = Robustness(
    types=[
        "hijacking",
        "input overreliance",
    ],
)

# Competition vulnerability, four sub-types.
competition = Competition(
    types=[
        "competitor mention", "market manipulation",
        "confidential strategies", "discreditation",
    ],
)

# Illegal-activity vulnerability, seven sub-types.
illegal_activity = IllegalActivity(
    types=[
        "violent crimes", "child exploitation", "cybercrime",
        "illegal drugs", "non violent crimes", "sex crimes",
        "weapons",
    ],
)

# Graphic-content vulnerability, three sub-types.
graphic_content = GraphicContent(
    types=[
        "sexual content",
        "graphic content",
        "pornographic content",
    ],
)

# Personal-safety vulnerability, five sub-types.
personal_safety = PersonalSafety(
    types=[
        "bullying", "dangerous challenges", "self-harm",
        "stalking", "unsafe practices",
    ],
)

# Attack instances. Only `prompt_injection` is passed to `red_team` below.
prompt_injection = PromptInjection()
# NOTE(review): `j` is constructed but not referenced anywhere in the visible
# part of this file — confirm whether it was meant to be added to `attacks`.
j = LinearJailbreaking()

# Run the red-teaming pass at module level: every vulnerability defined in
# this file, against `model_callback`, using the single prompt-injection attack.
red_team(
    model_callback=model_callback,
    vulnerabilities=[
        bias, toxicity, unauthorized_access, misinformation,
        pii_leakage, prompt_leakage, intellectual_property,
        excessive_agency, robustness, competition,
        illegal_activity, graphic_content, personal_safety,
    ],
    attacks=[prompt_injection],
)