# Source: modeltrain/process.py (trisanths/modeltrain, branch "main", GitHub)
from g4f.client import Client

# Single g4f client instance; the processing loop further down issues every
# chat-completion request through it.
client = Client()

import os

# Sampling / output settings for text generation.
# NOTE(review): nothing in the visible script passes this mapping to the
# client call, so these values currently have no effect — confirm whether
# they are meant to be forwarded.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)


# System prompt sent with every request. It tells the model to extract the
# fields of a Reddit "chance me" post into the JSON shape shown in the
# template, using null for anything the post does not state.
#
# The template is illustrative rather than literal JSON (it carries `//`
# annotations and `...` placeholders), but its structure must still be
# well-formed: every member is comma-separated and the Markdown code fence
# that opens the template is closed again.
system_instruction = """
You are a highly skilled data extraction AI. Your task is to analyze Reddit 'chance me' posts and extract information to create a structured JSON output.

**Strictly adhere to the following JSON template.**  If a piece of information is not explicitly mentioned in the Reddit post, use `null` as the value for that field.  Do not invent or assume information.  Focus only on extracting data present in the text.

**Output MUST be valid JSON.**

```json
{
  "Schools": {
    "[School Name]": {
      "Decision": "(Accepted/Rejected/Waitlisted/Pending/null)",
      "Application_Type": "(ED/EA/RD/null)",
      "Financial_Aid": {
        "University": "(Details or null)",
        "Scholarships": ["Scholarship Name", ...] // Array of strings or empty array if none
      }
    },
    // ... more schools as key-value pairs
  },
  "Demographics": {
    "Gender": "(Male/Female/Other/null)",
    "Race/Ethnicity": "(e.g., White, Asian, Black/null)",
    "State": "(US State or Country/null)",
    "Type_of_school": "(Public/Private/Parochial/null)",
    "Hooks": "(URM, First Gen, Athlete, etc./null)"
  },
  "Intended_major(s)": {
    "Default": "(Major if not school-specific/null)",
    "[School Name]": "(Major at specific school/null)" // School-specific majors as key-value pairs
  },
  "Academics": {
    "ACT": {
      "Composite": (Number or null),
      "Reading": (Number or null),
      "Math": (Number or null) // Optional sections
    },
    "SAT": {
      "Composite": (Number or null),
      "Reading": (Number or null),
      "Math": (Number or null) // Optional sections
    },
    "SAT II": "(List of SAT IIs if available/null)", // String or null
    "Class_rank": "(e.g., 10/200 or null)", // String or null
    "UW/W GPA": {
      "UW": "(GPA or null)", // String or null
      "W": "(GPA or null)"  // String or null
    },
    "Coursework": "(Description of APs, Honors, etc./null)" // String or null
  },
  "Awards": ["Award 1", "Award 2", ...], // Array of strings or empty array if none
  "Extracurriculars": ["EC 1", "EC 2", ...] // Array of strings or empty array if none
}
```
"""

# Batch driver: read each saved post from applications/application<id>.txt,
# send it to the chat model together with the system prompt, and store the
# model's reply as dataset/application-<id>.json.
#
# The id range is half-open: 1212 is processed, 3085 is not.
#
# A problem with one item (missing input file, failed request, empty reply)
# is reported and the item is skipped, so a single bad entry does not abort
# the remainder of the batch. Skipped ids are collected for a final summary.
skipped_ids = []

for i in range(1212, 3085):
    source_path = f"applications/application{i}.txt"
    try:
        with open(source_path, "r", encoding="utf-8") as f:
            x = f.read()
    except FileNotFoundError:
        # A gap in the numbering should not stop the rest of the run.
        print(f"[{i}] skipped: {source_path} not found")
        skipped_ids.append(i)
        continue
    print(x)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": x},
            ],
            web_search=False,
        )
    except Exception as exc:
        # Batch boundary: any request failure is reported and the loop
        # moves on. KeyboardInterrupt is not an Exception subclass, so
        # Ctrl-C still stops the run.
        print(f"[{i}] skipped: request failed: {exc!r}")
        skipped_ids.append(i)
        continue

    content = response.choices[0].message.content if response.choices else None
    if not content:
        # Writing None would raise TypeError, and an empty .json file is
        # of no use to later stages.
        print(f"[{i}] skipped: model returned no content")
        skipped_ids.append(i)
        continue

    # Created on demand so the first write cannot fail on a missing directory.
    os.makedirs("dataset", exist_ok=True)
    with open(f"dataset/application-{i}.json", "w", encoding="utf-8") as c:
        c.write(content)

if skipped_ids:
    print(f"{len(skipped_ids)} application(s) produced no output: {skipped_ids}")