-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.py
More file actions
88 lines (75 loc) · 2.81 KB
/
process.py
File metadata and controls
88 lines (75 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from g4f.client import Client
# Single module-level g4f client; the processing loop below sends every
# chat-completion request through this instance.
client = Client()
import os
# Text-generation settings (sampling controls, output length cap and reply
# MIME type). NOTE(review): the chat-completion call in this file does not
# pass this mapping, so it currently has no effect on requests -- confirm
# whether it is still needed.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)
# System prompt sent with every request: it tells the model to turn one
# Reddit "chance me" post into a JSON object that follows the template below,
# using null for anything the post does not state.
#
# The "(...)" placeholders and "//" notes inside the template are guidance for
# the model, not literal JSON. The template itself is kept well-formed where it
# can be: the Markdown fence is closed and every non-final entry ends with a
# comma, so the model is not shown a malformed object to imitate.
system_instruction = """
You are a highly skilled data extraction AI. Your task is to analyze Reddit 'chance me' posts and extract information to create a structured JSON output.
**Strictly adhere to the following JSON template.** If a piece of information is not explicitly mentioned in the Reddit post, use `null` as the value for that field. Do not invent or assume information. Focus only on extracting data present in the text.
**Output MUST be valid JSON.**
```json
{
"Schools": {
"[School Name]": {
"Decision": "(Accepted/Rejected/Waitlisted/Pending/null)",
"Application_Type": "(ED/EA/RD/null)",
"Financial_Aid": {
"University": "(Details or null)",
"Scholarships": ["Scholarship Name", ...] // Array of strings or empty array if none
}
},
// ... more schools as key-value pairs
},
"Demographics": {
"Gender": "(Male/Female/Other/null)",
"Race/Ethnicity": "(e.g., White, Asian, Black/null)",
"State": "(US State or Country/null)",
"Type_of_school": "(Public/Private/Parochial/null)",
"Hooks": "(URM, First Gen, Athlete, etc./null)"
},
"Intended_major(s)": {
"Default": "(Major if not school-specific/null)",
"[School Name]": "(Major at specific school/null)" // School-specific majors as key-value pairs
},
"Academics": {
"ACT": {
"Composite": (Number or null),
"Reading": (Number or null),
"Math": (Number or null) // Optional sections
},
"SAT": {
"Composite": (Number or null),
"Reading": (Number or null),
"Math": (Number or null) // Optional sections
},
"SAT II": "(List of SAT IIs if available/null)", // String or null
"Class_rank": "(e.g., 10/200 or null)", // String or null
"UW/W GPA": {
"UW": "(GPA or null)", // String or null
"W": "(GPA or null)" // String or null
},
"Coursework": "(Description of APs, Honors, etc./null)" // String or null
},
"Awards": ["Award 1", "Award 2", ...], // Array of strings or empty array if none
"Extracurriculars": ["EC 1", "EC 2", ...] // Array of strings or empty array if none
}
```
"""
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence.

    Only a reply that is entirely wrapped -- first line ```` ``` ```` plus an
    optional alphanumeric language tag, last line ```` ``` ```` -- is
    unwrapped. Anything else is returned unchanged, so a reply that is
    already bare JSON (or is fenced in some unexpected way) is never altered.
    """
    lines = text.strip().splitlines()
    if len(lines) < 2:
        return text
    opening = lines[0].strip()
    closing = lines[-1].strip()
    language_tag = opening[3:]
    fenced = (
        opening.startswith("```")
        and closing == "```"
        and (not language_tag or language_tag.isalnum())
    )
    if not fenced:
        return text
    return "\n".join(lines[1:-1])


def process_applications(start=1212, stop=3085,
                         input_dir="applications", output_dir="dataset"):
    """Convert saved posts into JSON files, one model request per post.

    For every index ``i`` in ``range(start, stop)`` the text of
    ``<input_dir>/application<i>.txt`` is sent to the module-level ``client``
    together with ``system_instruction``, and the reply is written to
    ``<output_dir>/application-<i>.json``.

    A problem with a single post (missing input file, failed request, empty
    reply) is reported on stdout and that post is skipped, so one bad item
    does not abort the rest of the batch.

    Args:
        start: First application index to process (inclusive).
        stop: Index at which to stop (exclusive).
        input_dir: Directory holding the ``application<i>.txt`` files.
        output_dir: Directory receiving the ``application-<i>.json`` files;
            created if it does not exist.

    Returns:
        The number of JSON files written.
    """
    os.makedirs(output_dir, exist_ok=True)
    written = 0

    for i in range(start, stop):
        source_path = os.path.join(input_dir, f"application{i}.txt")
        target_path = os.path.join(output_dir, f"application-{i}.json")

        try:
            with open(source_path, "r", encoding="utf-8") as f:
                post = f.read()
        except FileNotFoundError:
            print(f"[{i}] skipped: {source_path} not found")
            continue
        print(post)

        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_instruction},
                    {"role": "user", "content": post},
                ],
                web_search=False,
            )
        except Exception as exc:
            # Per-item boundary of a long batch: the request can fail for
            # reasons outside this script (network, provider), so report the
            # failure and carry on with the next post.
            print(f"[{i}] request failed: {exc!r}")
            continue

        choices = response.choices
        content = choices[0].message.content if choices else None
        if not content:
            # Check before opening the target so no empty or truncated file
            # is left behind for a post that produced no reply.
            print(f"[{i}] skipped: empty response")
            continue

        with open(target_path, "w", encoding="utf-8") as out:
            out.write(_strip_code_fence(content))
        written += 1

    return written


if __name__ == "__main__":
    process_applications()