From 41692eea0ed7fc50a77b2b21a61c6032e5e845f8 Mon Sep 17 00:00:00 2001
From: akshajnad <98061663+akshajnad@users.noreply.github.com>
Date: Wed, 3 Sep 2025 23:28:12 -0400
Subject: [PATCH 1/6] Create app.py

---
 submissions/Akshaj Nadimpalli/app.py | 590 +++++++++++++++++++++++++++
 1 file changed, 590 insertions(+)
 create mode 100644 submissions/Akshaj Nadimpalli/app.py

diff --git a/submissions/Akshaj Nadimpalli/app.py b/submissions/Akshaj Nadimpalli/app.py
new file mode 100644
index 0000000..23730ab
--- /dev/null
+++ b/submissions/Akshaj Nadimpalli/app.py	
@@ -0,0 +1,590 @@
+import argparse, csv, json, os, random, re, html
+from pathlib import Path
+from bs4 import BeautifulSoup
+import numpy as np
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+import json
+import urllib.parse
+
+# ---------- Utilities ----------
+
+from urllib.request import Request, urlopen
+from urllib.error import HTTPError, URLError
+
+def _fetch(url, headers):  # GET `url` with `headers`; return the body decoded as UTF-8 (undecodable bytes dropped)
+    with urlopen(Request(url, headers=headers), timeout=20) as resp:  # closes the connection even if read() fails
+        return resp.read().decode("utf-8", errors="ignore")
+
+def get_text(src: str) -> str:
+    """Return the text behind `src`: a Wikipedia URL, any other http(s) URL, or a local file path."""
+    def is_short(s: str, n: int = 800) -> bool:
+        return len((s or "").strip()) < n
+
+    if src.startswith(("http://", "https://")):
+        hdrs = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
+            "Accept-Language": "en-US,en;q=0.9",
+        }
+        def fetch(u):  # every request below reuses the same browser-like headers
+            return _fetch(u, hdrs)
+
+        # ---------------- WIKIPEDIA ----------------
+        if "wikipedia.org/wiki/" in src:  # NOTE(review): the API URLs below are hard-coded to en.wikipedia.org
+            slug = src.rsplit("/", 1)[-1]
+            title = urllib.parse.unquote(slug).replace("_", " ")
+            # Attempt 1: REST "plain" endpoint. Any exception, or < 1200 chars after cleaning, falls through.
+            try:
+                plain = fetch(f"https://en.wikipedia.org/api/rest_v1/page/plain/{slug}")
+                plain = re.sub(r'\r\n?', '\n', plain)
+                cleaned = clean_wiki_text(plain)
+                if not is_short(cleaned, 1200):
+                    return cleaned
+            except Exception:
+                pass
+            # Attempt 2: REST mobile HTML with tags removed by regex and whitespace collapsed.
+            try:
+                mob = fetch(f"https://en.wikipedia.org/api/rest_v1/page/mobile-html/{slug}")
+                text_only = re.sub(r"<[^>]+>", " ", mob)
+                text_only = re.sub(r"\s+", " ", text_only)
+                cleaned = clean_wiki_text(text_only)
+                if not is_short(cleaned, 1200):
+                    return cleaned
+            except Exception:
+                pass
+            # Attempt 3: action API plain-text extract; the first page with a non-empty extract is used.
+            try:
+                api = fetch("https://en.wikipedia.org/w/api.php"
+                            f"?action=query&prop=extracts&explaintext=1&redirects=1&format=json&titles={urllib.parse.quote(title)}")
+                obj = json.loads(api)
+                pages = obj.get("query", {}).get("pages", {})
+                extract = ""
+                for _, page in pages.items():
+                    if "extract" in page and page["extract"]:
+                        extract = page["extract"]
+                        break
+                cleaned = clean_wiki_text(extract)
+                if cleaned and not is_short(cleaned, 1200):
+                    return cleaned
+            except Exception:
+                pass
+            # Attempt 4: scrape `src` itself; h2/h3 headings are rewritten as "== Heading ==" lines.
+            try:
+                html_doc = fetch(src)
+                soup = BeautifulSoup(html_doc, "html.parser")
+                for el in soup.select("sup.reference, span.mw-editsection"):
+                    el.decompose()
+                root = soup.select_one("div.mw-parser-output") or soup
+                paras = []
+                for node in root.find_all(["p", "h2", "h3", "li"], recursive=True):
+                    txt = node.get_text(" ", strip=True)
+                    if not txt:
+                        continue
+                    if node.name in ("h2", "h3"):
+                        paras.append(f"\n== {txt} ==")
+                    else:
+                        paras.append(txt)
+                body = "\n".join(paras)
+                cleaned = clean_wiki_text(body)
+                if not is_short(cleaned, 1200):
+                    return cleaned
+            except Exception:
+                pass
+            # Attempt 5: REST summary extract; accepted at any non-empty length.
+            try:
+                data = json.loads(fetch(f"https://en.wikipedia.org/api/rest_v1/page/summary/{slug}"))
+                extract = (data.get("extract") or "").strip()
+                cleaned = clean_wiki_text(extract)
+                if cleaned:
+                    return cleaned
+            except Exception:
+                pass
+
+            return ""  # every Wikipedia attempt failed or produced too little text
+
+        # --------------- NON-WIKIPEDIA ---------------
+        try:
+            html_doc = fetch(src)
+        except Exception as e:
+            raise FileNotFoundError(f"Could not fetch URL: {src} ({e})")
+        # Keep only <p>, <li>, <h2> and <h3> text, joined and whitespace-normalised into one line.
+        soup = BeautifulSoup(html_doc, "html.parser")
+        body = " ".join(x.get_text(" ", strip=True) for x in soup.find_all(["p","li","h2","h3"]))
+        return re.sub(r"\s+", " ", body).strip()
+
+    # --------------- LOCAL FILE ----------------
+    p = Path(src)
+    if p.exists():
+        return p.read_text(encoding="utf-8", errors="ignore")
+    raise FileNotFoundError(f"Source not found: {src}")
+
+def clean_wiki_text(text: str) -> str:
+    """Remove Wikipedia boilerplate (trailing reference sections, citation tags, IDs) and tidy whitespace."""
+    rules = [
+        # everything from the first trailing-matter heading to the end of the text
+        (r'(?:^|\n)==\s*(References|See also|External links|Notes|Further reading)\s*==.*$', '', re.I | re.S),
+        # bracketed citation markers such as [12], [a], [note 3], [citation needed]
+        (r'\s*\[(?:\d+|[a-z]|note\s*\d+|citation needed)\]\s*', ' ', re.I),
+        # parenthesised editorial tags
+        (r'\((?:citation needed|clarification needed|disputed|page needed|dead link|link rot)\)', ' ', re.I),
+        # bibliographic identifiers and retrieval dates
+        (r'\bISBN[:\s][0-9Xx\-–]+\b', ' ', 0),
+        (r'\bDOI:\s*[^\s,;]+', ' ', re.I),
+        (r'\bPMID\s*\d+\b', ' ', re.I),
+        (r'\bRetrieved\s+\d{1,2}\s+\w+\s+\d{4}\b', ' ', re.I),
+        # navbox "v • t • e" residue
+        (r'\bv\s*•\s*t\s*•\s*e\b', ' ', re.I),
+        # collapse runs of spaces/tabs, then runs of newlines
+        (r'[ \t]+', ' ', 0),
+        (r'\n{2,}', '\n', 0),
+    ]
+    cleaned = text.replace("\xa0", " ")
+    for pattern, repl, flags in rules:
+        cleaned = re.sub(pattern, repl, cleaned, flags=flags)
+    return cleaned.strip()
+
+def is_content_sentence(s: str) -> bool:
+    """Heuristically decide whether `s` reads like a factual prose sentence.
+
+    The stripped text must end in '.', '!' or '?' (optionally followed by a double
+    quote), be 50-350 characters and at least 8 words long, contain one of a fixed set
+    of verbs, match none of the citation/URL/title patterns, and have at most 35% of
+    its letters in upper case.
+    """
+    text = s.strip()
+    verb_re = r'\b(is|are|was|were|has|have|had|includes?|use[sd]?|named|called|described|developed|introduced|designed|built|created|founded|occurr?ed|took|won|published|works?|served|led|caused)\b'
+    junk_re = "|".join([
+        r'\bISBN\b', r'\bDOI\b', r'\bPMID\b',
+        r'\bRetrieved\b', r'Wayback Machine', r'Archive\.?org',
+        r'www\.', r'https?://', r'\.com\b', r'\.org\b', r'\.edu\b',
+        r'\bIn:\s', r'pcs\.c1\.Page', r'\bv\s*•\s*t\s*•\s*e\b',
+        r'^[\"“][A-Z][^.!?]{0,100}[\"”]$',
+        r'^[A-Z][A-Za-z0-9 ,–\-:]{0,120}$',
+    ])
+
+    # Cheap structural checks first: terminal punctuation, length, word count.
+    if re.search(r'[.!?]"?$', text) is None:
+        return False
+    if len(text) < 50 or len(text) > 350 or len(text.split()) < 8:
+        return False
+    # Needs a recognised verb and must not look like a citation, link or bare title.
+    if re.search(verb_re, text, flags=re.I) is None:
+        return False
+    if re.search(junk_re, text, flags=re.I) is not None:
+        return False
+
+    # Reject text dominated by capitals (more than 35% of letters upper-case).
+    alpha = re.findall(r'[A-Za-z]', text)
+    upper_ratio = sum(1 for ch in alpha if ch.isupper()) / len(alpha) if alpha else 0.0
+    return upper_ratio <= 0.35
+
+
+def sent_split(text: str):
+    """Split `text` into de-duplicated candidate sentences (at most 400).
+
+    Chunks ending in a lone capital plus '.' (an initial) are re-joined with the chunk
+    that follows. If fewer than 10 sentences pass is_content_sentence, a looser
+    length/no-identifier filter is applied to all merged chunks instead.
+    """
+    raw = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(“"])', text)
+    chunks = [piece.strip() for piece in raw if piece and len(piece.strip()) >= 10]
+
+    merged, pos = [], 0
+    while pos < len(chunks):
+        current, step = chunks[pos], 1
+        if re.search(r'\b[A-Z]\.$', current) and pos + 1 < len(chunks):
+            current, step = (current + " " + chunks[pos + 1]).strip(), 2
+        merged.append(current)
+        pos += step
+
+    keep = [m for m in merged if is_content_sentence(m)]
+    if len(keep) < 10:
+        keep = [m for m in merged if 40 <= len(m) <= 350 and not re.search(r'(ISBN|DOI|PMID|www\.|https?://)', m, flags=re.I)]
+
+    unique = {}
+    for sentence in keep:
+        unique.setdefault(sentence.lower(), sentence)  # dicts keep insertion order; the first spelling wins
+    return list(unique.values())[:400]
+
+def swap_entity(sentence: str, target: str, replacement: str) -> str:
+    """Replace the first literal occurrence of `target` in `sentence` with `replacement`."""
+    return sentence.replace(target, replacement, 1)  # unlike re.sub, backslashes in the replacement stay literal
+
+# ---------- Teacher components ----------
+
+_fe = None  # cached feature-extraction pipeline; populated by get_teacher_pipes()
+_ner = None  # cached NER pipeline; populated by get_teacher_pipes()
+_t2t = None  # cached text2text-generation pipeline; populated by get_teacher_pipes()
+
+
+def get_teacher_pipes():
+    """Return the (feature-extraction, NER, text2text) pipelines, creating each on first use."""
+    global _fe, _ner, _t2t
+    # The conditional expressions only call pipeline() for slots that are still None,
+    # so each model is constructed at most once and then reused from the module globals.
+    _fe = _fe if _fe is not None else pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
+    _ner = _ner if _ner is not None else pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
+    _t2t = _t2t if _t2t is not None else pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=96)
+    return _fe, _ner, _t2t
+
+
+def embed_sents(fe, sents):
+    """Mean-pool the first output of `fe` for every sentence and stack the vectors row-wise."""
+    return np.stack([np.mean(np.array(fe(text, truncation=True)[0]), axis=0) for text in sents])
+
+
+def cosine(a, b):  # cosine similarity of two vectors as a Python float; the 1e-9 term avoids division by zero
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+
+def rank_sentences(sents, fe):
+    """Return `sents` ordered by cosine similarity to their mean embedding, highest first."""
+    if not sents: return []  # embed_sents would call np.stack([]) and raise ValueError
+    M = embed_sents(fe, sents)
+    centroid = M.mean(axis=0)
+    return [sents[i] for i in np.argsort([cosine(v, centroid) for v in M])[::-1]]
+
+
+def pick_entity(ner_res):
+    """Return the first highest-weighted entity whose word is longer than 2 chars, else None."""
+    # "PER" is the person tag emitted by CoNLL-style NER models such as dslim/bert-base-NER.
+    pref = {"ORG":3,"PERSON":3,"PER":3,"LOC":3,"GPE":3,"EVENT":2,"DATE":2,"WORK_OF_ART":2,"LAW":2}
+    best = None; bestw = -1
+    for ent in ner_res:
+        w = pref.get(ent.get("entity_group"), 1)
+        if w > bestw and len((ent.get("word") or "").strip()) > 2: best, bestw = ent, w
+    return best
+
+def parse_distractors(raw: str, sentence: str):
+    """Extract up to four candidate distractors from raw generated text.
+
+    `raw` is split on ';', newlines and ', ' followed by a capital letter. A piece is
+    kept when, after trimming list/quote punctuation, it is 2-80 characters long, has a
+    run of at least three letters, carries no placeholder or URL marker, does not occur
+    in `sentence`, and has not been seen before (case-insensitively).
+    """
+    kept, seen_lower = [], set()
+    for piece in re.split(r'[;\n]|(?:,\s(?=[A-Z]))', raw):
+        cand = piece.strip(" -•\t\"'[]()")
+        usable = (
+            2 <= len(cand) <= 80
+            and not re.search(r'(Alt\d|##|pcs\.c1|https?://|www\.)', cand, flags=re.I)
+            and re.search(r'[A-Za-z]{3,}', cand)
+            and cand not in sentence
+            and cand.lower() not in seen_lower
+        )
+        if usable:
+            seen_lower.add(cand.lower()); kept.append(cand)
+    return kept[:4]
+
+def _norm_ent(s: str) -> str:
+    """Drop '##' markers, then collapse whitespace runs and trim the ends."""
+    without_markers = s.replace("##", "")
+    return re.sub(r"\s+", " ", without_markers).strip()
+
+def _distinct3(a: str, b: str, c: str) -> bool:
+    """Return True only when the three stripped strings are non-empty and pairwise distinct."""
+    # NOTE: redefined further down in this module; the later definition wins, so keep the two in sync.
+    vals = {a.strip(), b.strip(), c.strip()}
+    return "" not in vals and len(vals) == 3
+
+def _bad_text(s: str) -> bool:
+    """Flag text that is missing, under 8 chars, carries placeholder/URL markers, or is mostly upper-case."""
+    if not s or len(s) < 8:
+        return True
+    if re.search(r'(Alt\d|##|https?://|www\.)', s, re.I) is not None:
+        return True
+    alpha = re.findall(r'[A-Za-z]', s)
+    capitals = [ch for ch in alpha if ch.isupper()]
+    return bool(alpha) and len(capitals) / len(alpha) > 0.45
+
+def display_clean(s: str) -> str:
+    """Strip citation markers and '##' fragments, repair '. .' runs, and normalise whitespace."""
+    s = s.replace('##', '')  # must precede whitespace collapsing, otherwise 'a ## b' keeps a double space
+    s = re.sub(r'\s*\[(?:\d+|[a-z]|note\s*\d+|citation needed|self-published source\?)\]\s*', ' ', s, flags=re.I)
+    s = re.sub(r'\s*\. \.\s*', '. ', s)
+    return re.sub(r'\s+', ' ', s).strip()
+
+def _distinct3(a: str, b: str, c: str) -> bool:
+    """Tell whether a, b and c are three different, non-empty strings once stripped."""
+    trimmed = [part.strip() for part in (a, b, c)]
+    # all() rejects any empty entry; the set comparison rejects duplicates
+    return all(trimmed) and len(set(trimmed)) == 3
+
+def _looks_like_title_or_link(s: str) -> bool:
+    """Heuristic: does `s` contain a URL/domain, look like a short quoted title, or shout in capitals?"""
+    link_hit = re.search(r'(?:www\.|https?://|\.com\b|\.org\b|\.edu\b)', s, flags=re.I)
+    quoted_title = re.match(r'^[\"“][A-Z][^.!?]{0,60}[\"”]$', s)
+    if link_hit or quoted_title:
+        return True
+    alpha = re.findall(r'[A-Za-z]', s)
+    upper_count = sum(1 for ch in alpha if ch.isupper())
+    return bool(alpha) and upper_count / len(alpha) > 0.45
+
+
+def list_entities(sentence: str, ner):
+    """Return unique (word, entity_group) pairs found by `ner`, highest preference first.
+
+    Words shorter than 3 characters and case-insensitive repeats are dropped.
+    """
+    # "PER" is the person tag of CoNLL-style models (e.g. dslim/bert-base-NER); rank it like "PERSON".
+    pref = {"ORG":3,"PERSON":3,"PER":3,"LOC":3,"GPE":3,"EVENT":2,"DATE":2,"WORK_OF_ART":1,"LAW":1}
+    seen = set(); out = []
+    for e in ner(sentence):
+        t = e.get("entity_group"); w = (e.get("word") or "").strip()
+        if not t or len(w) < 3 or w.lower() in seen: continue
+        seen.add(w.lower())
+        out.append((w, t, pref.get(t, 0)))
+    return [(w, t) for (w, t, _) in sorted(out, key=lambda x: x[2], reverse=True)]
+
+def build_entity_pool(sents, ner):
+    """Group the entity words `ner` finds in `sents` into a {entity_group: set_of_words} dict."""
+    by_group = {}
+    for sentence in sents:
+        for hit in ner(sentence):
+            group, word = hit.get("entity_group"), (hit.get("word") or "").strip()
+            if not group or len(word) <= 2: continue  # skip untyped hits and very short words
+            by_group.setdefault(group, set()).add(word)
+    return by_group
+
+
+def teacher_make_item(sentence: str, entity_pool=None, forced_entity=None):  # build one quiz-item dict from `sentence`, or return None
+    fe, ner, t2t = get_teacher_pipes()
+    # Target entity: the caller's (text, type) pair when given, otherwise the pick_entity() choice from NER.
+    if forced_entity is not None:
+        ent_text, ent_type = forced_entity
+    else:
+        ents = ner(sentence)
+        anchor = pick_entity(ents)
+        if not anchor:
+            return None
+        ent_text = anchor["word"]
+        ent_type = anchor.get("entity_group")
+
+    ent_text = _norm_ent(ent_text)
+    # Ask the text2text model for two alternative entities, separated by semicolons.
+    prompt = (
+        f"You are writing exam distractors. Given the sentence:\n"
+        f"'{sentence}'\n"
+        f"Propose TWO realistic but incorrect alternatives for the entity '{ent_text}'. "
+        f"Each must be the same entity TYPE and plausibly confusable in context. "
+        f"Return as a plain list separated by semicolons."
+    )
+    cand = t2t(prompt)[0]["generated_text"]
+    alts = parse_distractors(cand, sentence)
+    # Fewer than two usable alternatives: retry once with an explicit output format.
+    if len(alts) < 2:
+        cand2 = t2t(prompt + "\nReturn exactly two alternatives in the format: Alt1; Alt2")[0]["generated_text"]
+        alts = parse_distractors(cand2, sentence)
+    # Still short: top up with randomly ordered same-type entities from the pool (excluding the target).
+    if len(alts) < 2 and entity_pool and ent_type in entity_pool:
+        candidates = [x for x in list(entity_pool[ent_type]) if _norm_ent(x).lower() != ent_text.lower()]
+        random.shuffle(candidates)
+        for c in candidates:
+            c2 = _norm_ent(c)
+            if c2 not in alts:
+                alts.append(c2)
+            if len(alts) >= 2:
+                break
+
+    alts = [a for a in alts if a][:2]
+    if len(alts) < 2:
+        return None
+    # Each distractor is the sentence with the first occurrence of the entity swapped out.
+    wrong1 = swap_entity(sentence, ent_text, alts[0])
+    wrong2 = swap_entity(sentence, ent_text, alts[1])
+    # An unchanged sentence means the swap replaced nothing, so no distractor was produced.
+    if wrong1 == sentence or wrong2 == sentence:
+        return None
+    # Discard items whose distractors trip the text-quality heuristics.
+    if (_bad_text(wrong1) or _bad_text(wrong2) or
+        _looks_like_title_or_link(wrong1) or _looks_like_title_or_link(wrong2)):
+        return None
+
+    correct = sentence
+    if not _distinct3(correct, wrong1, wrong2):
+        return None
+
+    correct = display_clean(correct)
+    wrong1  = display_clean(wrong1)
+    wrong2  = display_clean(wrong2)
+    # The explanation is generated from the cleaned distractors and the raw sentence.
+    expl = t2t(
+        f"Explain concisely (1–2 sentences each) why these are wrong in context:\n"
+        f"Context: {sentence}\nWrong A: {wrong1}\nWrong B: {wrong2}"
+    )[0]["generated_text"]
+
+    return {
+        "context": sentence,
+        "entity": ent_text,
+        "correct": correct,
+        "distractors": [wrong1, wrong2],
+        "explanations": [expl]
+    }
+
+
+# ---------- Student (adapter) inference ----------
+
+_student_model = None  # cached student model; populated by load_student()
+_student_tok = None  # cached tokenizer that goes with the student model
+
+
+def load_student(adapter_dir: str):
+    """Return the cached (tokenizer, model) pair, loading google/flan-t5-small on the first call."""
+    global _student_model, _student_tok
+    if _student_model is not None: return _student_tok, _student_model
+    _student_tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
+    _student_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
+    if adapter_dir and Path(adapter_dir).exists():  # adapters are optional; without them the base model is used
+        from peft import PeftModel
+        _student_model = PeftModel.from_pretrained(_student_model, adapter_dir)
+    _student_model.eval()
+    return _student_tok, _student_model
+
+
+def student_generate(adapter_dir: str, sentence: str, entity_pool=None):
+    """Ask the student model for a quiz item as JSON; use teacher_make_item when the output is unusable."""
+    tok, mdl = load_student(adapter_dir)
+    prompt = "".join([
+        "Task: Given a factual sentence and a target entity from it, ",
+        "produce a JSON object with keys: correct (string), distractors (array of two strings), ",
+        "explanations (array of 1-2 short sentences). The distractors must be plausible but incorrect and of the same entity type.\n",
+        f"Sentence: {sentence}\n",
+        "Output strictly valid JSON.",
+    ])
+    generated = mdl.generate(**tok(prompt, return_tensors="pt"), max_new_tokens=192)
+    decoded = tok.decode(generated[0], skip_special_tokens=True).strip()
+    try:
+        parsed = json.loads(decoded)
+        correct, distractors = parsed.get("correct"), parsed.get("distractors")
+        # Anything other than a string answer plus a list of >= 2 distractors counts as a failure.
+        if not isinstance(correct, str) or not isinstance(distractors, list) or len(distractors) < 2:
+            raise ValueError
+        return {
+            "context": sentence,
+            "entity": None,
+            "correct": correct,
+            "distractors": distractors[:2],
+            "explanations": parsed.get("explanations", []),
+        }
+    except Exception:
+        return teacher_make_item(sentence, entity_pool=entity_pool)
+
+# ---------- Emitters ----------
+
+def write_anki(cards, outdir: Path):  # write the cards to <outdir>/anki_flashcards.csv (Front, Back, Extra columns)
+    csvp = outdir / "anki_flashcards.csv"
+    with open(csvp, "w", newline="", encoding="utf-8") as f:
+        w = csv.writer(f)
+        w.writerow(["Front","Back","Extra"])
+        for c in cards:
+            q = "Which statement is correct?"
+            A = display_clean(c["correct"] or "")
+            B = display_clean(c["distractors"][0] or "")
+            C = display_clean(c["distractors"][1] or "")
+            if not _distinct3(A, B, C):  # skip cards whose three options are not distinct after cleaning
+                continue
+            # The correct answer gets a random slot; the two distractors fill the remaining slots in order.
+            correct_slot = random.randint(0, 2)
+            texts = [None, None, None]
+            texts[correct_slot] = A
+            rem = [i for i in (0,1,2) if i != correct_slot]
+            texts[rem[0]] = B
+            texts[rem[1]] = C
+            # Option text, explanation and context are HTML-escaped; <br> is used for line breaks.
+            opts = [(lab, texts[i], i == correct_slot) for i, lab in enumerate(["A","B","C"])]
+            front = f"{q}<br><br>" + "<br>".join(f"{lab}) {html.escape(txt)}" for lab, txt, _ in opts)
+            correct_letter = ["A","B","C"][correct_slot]
+            exp = display_clean(' '.join(c.get('explanations', [])))
+            back = f"Correct: {correct_letter}" + (f"<br><br>{html.escape(exp)}" if exp else "")
+            extra = f"Context: {html.escape(display_clean(c['context']))}"
+            w.writerow([front, back, extra])
+
+
+def write_quiz(cards, outdir: Path):  # write a self-contained multiple-choice page to <outdir>/quiz.html
+    htmlp = outdir / "quiz.html"
+    with open(htmlp, "w", encoding="utf-8") as f:
+        f.write("<!doctype html><meta charset='utf-8'><title>WhyWrong Quiz</title>")
+        f.write("<style>body{font-family:system-ui;margin:2rem;max-width:800px} .q{margin:1.5rem 0;padding:1rem;border:1px solid #ddd;border-radius:12px} .ans{display:none;margin:.5rem 0;color:#111;background:#f6f6f6;padding:.5rem;border-radius:8px}</style>")
+        f.write("<h1>WhyWrong — Counterfactual Quiz</h1>")
+        f.write("<p>Pick the correct statement. Click 'Check' to reveal the answer & explanation.</p>")
+        f.write("<script>function chk(i,ans){let sel=[...document.querySelectorAll('input[name=q'+i+']:checked')][0];let box=document.getElementById('ans'+i);box.style.display='block';box.querySelector('b').innerText=(sel?sel.value:'—')===ans?'Correct':'Incorrect';}</script>")
+        for i, c in enumerate(cards):
+            q = "Which statement is correct?"
+            A = display_clean(c["correct"] or "")
+            B = display_clean(c["distractors"][0] or "")
+            C = display_clean(c["distractors"][1] or "")
+            if not _distinct3(A, B, C):  # NOTE: `i` still advances, so a skipped card leaves a gap in the Q numbering
+                continue
+            # The correct answer gets a random slot; the two distractors fill the remaining slots in order.
+            correct_slot = random.randint(0, 2)
+            texts = [None, None, None]
+            texts[correct_slot] = A
+            rem = [idx for idx in (0,1,2) if idx != correct_slot]
+            texts[rem[0]] = B
+            texts[rem[1]] = C
+
+            correct_letter = ["A","B","C"][correct_slot]
+            # One radio group per question (name q<i>); the button passes the correct letter to chk().
+            f.write(f"<div class='q'><h3>Q{i+1}. {html.escape(q)}</h3>")
+            for lab, txt in zip(["A","B","C"], texts):
+                f.write(f"<label><input type='radio' name='q{i}' value='{lab}'> {lab}) {html.escape(txt)}</label><br>")
+            exp = display_clean(' '.join(c.get('explanations', [])))
+            f.write(f"<button onclick=\"chk({i},'{correct_letter}')\">Check</button><p></p>")
+            f.write(f"<div class='ans' id='ans{i}'><b></b><div><em>Explanation:</em> {html.escape(exp)}</div><div><em>Context:</em> {html.escape(display_clean(c['context']))}</div></div></div>")
+
+# ---------- Main ----------
+
+def main():
+    """CLI entry point: build quiz cards from --source and write CSV, HTML and JSON outputs.
+
+    Sentences are ranked by similarity to the document centroid and turned into cards
+    (teacher or student mode) until --k cards exist. Exits via SystemExit when the
+    source yields no usable sentences or no card could be produced.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--source", required=True, help="URL or path to text")
+    ap.add_argument("--k", type=int, default=10, help="number of cards to PRODUCE")
+    ap.add_argument("--mode", choices=["teacher","student"], default="teacher")
+    ap.add_argument("--adapters", default=None, help="path to LoRA adapters (for student mode)")
+    ap.add_argument("--outdir", default="outputs")
+    args = ap.parse_args()
+
+    os.makedirs(args.outdir, exist_ok=True)
+
+    text = get_text(args.source)
+    sents = sent_split(text)
+    print(f"[debug] sentences extracted = {len(sents)}; target cards k = {args.k}")
+    if not sents:
+        # Ranking an empty list would fail inside np.stack with an unhelpful traceback.
+        raise SystemExit("No usable sentences found in the source.")
+
+    fe, ner, _ = get_teacher_pipes()
+    ranked = rank_sentences(sents, fe)
+    pool = build_entity_pool(ranked[:120], ner)
+
+    # One loop serves both modes; only the per-sentence generator differs.
+    cards = []
+    for s in ranked:
+        if args.mode == "teacher":
+            item = teacher_make_item(s, entity_pool=pool)
+        else:
+            item = student_generate(args.adapters, s, entity_pool=pool)
+        if item:
+            cards.append(item)
+            if len(cards) >= args.k:
+                break
+
+    if not cards:
+        print(f"[debug] sentences extracted = {len(sents)}; trying to make k={args.k}")
+        raise SystemExit("No cards produced. Try a different source or increase --k.")
+
+    # Emit the three output artefacts into the output directory.
+    outdir = Path(args.outdir)
+    write_anki(cards, outdir)
+    write_quiz(cards, outdir)
+    (outdir/"report.json").write_text(json.dumps(cards, indent=2), encoding="utf-8")
+    print(f"Wrote: {outdir/'anki_flashcards.csv'}, {outdir/'quiz.html'}, {outdir/'report.json'}")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script; make_synth_data.py imports this module
+    main()

From 8de34d1c6c1a8ee3347612914a71a4c9333fc2f0 Mon Sep 17 00:00:00 2001
From: akshajnad <98061663+akshajnad@users.noreply.github.com>
Date: Wed, 3 Sep 2025 23:29:03 -0400
Subject: [PATCH 2/6] Create make_synth_data.py

---
 .../Akshaj Nadimpalli/make_synth_data.py      | 143 ++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 submissions/Akshaj Nadimpalli/make_synth_data.py

diff --git a/submissions/Akshaj Nadimpalli/make_synth_data.py b/submissions/Akshaj Nadimpalli/make_synth_data.py
new file mode 100644
index 0000000..6e58301
--- /dev/null
+++ b/submissions/Akshaj Nadimpalli/make_synth_data.py	
@@ -0,0 +1,143 @@
+import argparse, json, os, random, hashlib
+from pathlib import Path
+
+from app import (
+    get_text, sent_split, get_teacher_pipes, rank_sentences,
+    teacher_make_item, list_entities, build_entity_pool, swap_entity
+)
+
+random.seed(42)  # fixed seed for the global `random` generator, applied at import time
+
+def norm_key(sentence: str, forced_entity):
+    """MD5 hex digest of the stripped sentence, entity text and entity type joined by '||'."""
+    ent_text, ent_type = forced_entity if forced_entity else ("", "")
+    return hashlib.md5("||".join([sentence.strip(), (ent_text or "").strip(), (ent_type or "").strip()]).encode("utf-8")).hexdigest()
+
+def fabricate_item(sentence: str, ent_text: str, ent_type: str, pool_by_type: dict):
+    """Build a quiz item by swapping `ent_text` for two other pool entities of the same type.
+
+    Returns None when fewer than two alternatives exist or when a swap leaves the
+    sentence unchanged (the entity text does not occur verbatim in it).
+    """
+    # Sort before shuffling: pools built from sets have no stable order, which would defeat the seed.
+    choices = sorted(x for x in pool_by_type.get(ent_type, []) if x.lower() != (ent_text or "").lower())
+    random.shuffle(choices)
+    if len(choices) < 2:
+        return None
+    wrong1 = swap_entity(sentence, ent_text, choices[0])
+    wrong2 = swap_entity(sentence, ent_text, choices[1])
+    if sentence in (wrong1, wrong2):  # a no-op swap would present the correct sentence as a distractor
+        return None
+    expl = (
+        f"'{choices[0]}' and '{choices[1]}' are {ent_type} entities that do not match the original fact in context. "
+        f"The correct statement includes '{ent_text}' instead."
+    )
+    return {"context": sentence, "entity": ent_text, "correct": sentence,
+            "distractors": [wrong1, wrong2], "explanations": [expl]}
+
+def build_example(sentence: str, entity_pool, forced_entity=None):
+    """Create one {"input", "target"} training pair for `sentence`, or None.
+
+    The teacher pipeline is tried first. If it yields nothing and `forced_entity`
+    carries both an entity text and type, an item fabricated from `entity_pool`
+    is used instead.
+    """
+
+    def to_pair(item):
+        # Serialise a quiz item into the instruction prompt and its JSON target.
+        inst = (
+            "Task: Given a factual sentence, produce a JSON object with keys: "
+            "correct (string), distractors (array of two strings), explanations (array with 1-2 sentences). "
+            "The distractors must be plausible but incorrect and of the same entity type.\n"
+            f"Sentence: {item['context']}\nOutput strictly valid JSON."
+        )
+        tgt = json.dumps({
+            "correct": item["correct"],
+            "distractors": item["distractors"][:2],
+            "explanations": item.get("explanations", [])
+        }, ensure_ascii=False)
+        return {"input": inst, "target": tgt}
+
+    primary = teacher_make_item(sentence, entity_pool=entity_pool, forced_entity=forced_entity)
+    if primary:
+        return to_pair(primary)
+
+    # The fallback needs an explicit entity; without one there is nothing to swap.
+    if forced_entity is None:
+        return None
+    ent_text, ent_type = forced_entity
+    if not (ent_text and ent_type):
+        return None
+
+    fallback = fabricate_item(sentence, ent_text, ent_type, entity_pool)
+    if not fallback:
+        return None
+    return to_pair(fallback)
+
+def main():  # CLI entry point: write up to --target_total JSONL training examples built from the listed sources
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--sources", required=True, help="text file with one URL or local path per line")
+    ap.add_argument("--out", default="data/train.jsonl")
+    ap.add_argument("--k", type=int, default=30, help="TARGET items to MAKE per source")
+    ap.add_argument("--per_sentence_max", type=int, default=5, help="max entities to try per sentence")
+    ap.add_argument("--max_ranked", type=int, default=400, help="how many ranked sents to consider per source")
+    ap.add_argument("--target_total", type=int, default=1000, help="stop when total items reach this number")
+    args = ap.parse_args()
+    # Non-blank lines of the sources file, visited in shuffled order.
+    srcs = [x.strip() for x in Path(args.sources).read_text(encoding="utf-8").splitlines() if x.strip()]
+    random.shuffle(srcs)
+
+    os.makedirs(Path(args.out).parent, exist_ok=True)
+
+    fe, ner, _ = get_teacher_pipes()
+    n_ok = 0
+    seen_keys = set()
+    # The output file is opened in "w" mode, so an existing file is overwritten.
+    with open(args.out, "w", encoding="utf-8") as f:
+        for si, src in enumerate(srcs, 1):
+            if n_ok >= args.target_total:
+                break
+            try:
+                text = get_text(src)
+                sents = sent_split(text)
+                if not sents:
+                    print(f"[warn] no usable sentences in {src}; skipping")
+                    continue
+                # The entity pool is rebuilt per source from its top-ranked sentences.
+                ranked = rank_sentences(sents, fe)
+                pool = build_entity_pool(ranked[:min(len(ranked), args.max_ranked)], ner)
+
+                made_here = 0
+                tried_sentences = 0
+                # Stop as soon as the per-source quota (--k) or the global target is met.
+                for s in ranked[:args.max_ranked]:
+                    if made_here >= args.k or n_ok >= args.target_total:
+                        break
+                    ents = list_entities(s, ner)[:args.per_sentence_max]
+                    tried_sentences += 1
+                    if not ents:
+                        continue
+                    # One example is attempted per (sentence, entity) pair not already written.
+                    for ent in ents:
+                        if made_here >= args.k or n_ok >= args.target_total:
+                            break
+                        key = norm_key(s, ent)
+                        if key in seen_keys:
+                            continue
+
+                        ex = build_example(s, pool, forced_entity=ent)
+                        if ex:
+                            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+                            seen_keys.add(key)
+                            n_ok += 1
+                            made_here += 1
+
+                print(f"[info] [{si}/{len(srcs)}] {src}: made {made_here}/{args.k} (tried {tried_sentences} sents; total={n_ok})")
+            # Any failure while processing one source is reported and the run moves on to the next source.
+            except Exception as e:
+                print(f"[warn] skipping {src}: {e}")
+
+    print(f"Wrote {n_ok} examples to {args.out}")
+
+if __name__ == "__main__":  # run the CLI only when this file is executed as a script
+    main()

From 8c5d1f8e78ecb9dd289889dbbcd5a810b4edd4f6 Mon Sep 17 00:00:00 2001
From: akshajnad <98061663+akshajnad@users.noreply.github.com>
Date: Wed, 3 Sep 2025 23:30:24 -0400
Subject: [PATCH 3/6] Create train_lora.py

---
 submissions/Akshaj Nadimpalli/train_lora.py | 85 +++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 submissions/Akshaj Nadimpalli/train_lora.py

diff --git a/submissions/Akshaj Nadimpalli/train_lora.py b/submissions/Akshaj Nadimpalli/train_lora.py
new file mode 100644
index 0000000..42ddd91
--- /dev/null
+++ b/submissions/Akshaj Nadimpalli/train_lora.py	
@@ -0,0 +1,85 @@
+import argparse, json
+from pathlib import Path
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
+from peft import get_peft_model, LoraConfig, TaskType
+from inspect import signature
+
+def main():
+    """Fine-tune google/flan-t5-small with LoRA on a JSON(L) file of input/target pairs and save the adapters."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--train_path", required=True)
+    ap.add_argument("--output_dir", default="adapters/whywrong-lora")
+    ap.add_argument("--epochs", type=int, default=1)
+    ap.add_argument("--batch_size", type=int, default=8)
+    ap.add_argument("--lr", type=float, default=1e-4)
+    ap.add_argument("--lora_r", type=int, default=16)
+    ap.add_argument("--lora_alpha", type=int, default=32)
+    ap.add_argument("--lora_dropout", type=float, default=0.05)
+    ap.add_argument("--max_source_len", type=int, default=768)
+    ap.add_argument("--max_target_len", type=int, default=256)
+    args = ap.parse_args()
+
+    base = "google/flan-t5-small"
+    tok = AutoTokenizer.from_pretrained(base)
+    mdl = AutoModelForSeq2SeqLM.from_pretrained(base)
+
+    ds = load_dataset("json", data_files=args.train_path, split="train")
+    ds = ds.train_test_split(test_size=0.1, seed=42)
+
+    def tok_fn(batch):
+        model_in = tok(batch["input"], truncation=True, max_length=args.max_source_len)
+        labels = tok(text_target=batch["target"], truncation=True, max_length=args.max_target_len)
+        model_in["labels"] = labels["input_ids"]
+        return model_in
+
+    ds_tok = ds.map(tok_fn, batched=True, remove_columns=ds["train"].column_names)
+
+    peft_cfg = LoraConfig(
+        task_type=TaskType.SEQ_2_SEQ_LM,
+        r=args.lora_r,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        target_modules=["q","v","k","o"]  # common for T5
+    )
+    mdl = get_peft_model(mdl, peft_cfg)
+    collator = DataCollatorForSeq2Seq(tok, model=mdl)
+
+    kw = dict(
+        output_dir=args.output_dir,
+        per_device_train_batch_size=args.batch_size,
+        per_device_eval_batch_size=args.batch_size,
+        learning_rate=args.lr,
+        num_train_epochs=args.epochs,
+        logging_steps=50,
+        remove_unused_columns=True,
+        report_to=[],
+    )
+
+    # `evaluation_strategy` was renamed `eval_strategy`; use whichever name this transformers version accepts.
+    sig = signature(TrainingArguments).parameters
+    eval_key = next((k for k in ("eval_strategy", "evaluation_strategy") if k in sig), None)
+    if eval_key:
+        kw[eval_key] = "epoch"
+    if "save_strategy" in sig:
+        kw["save_strategy"] = "epoch"
+    kw.update({k: False for k in ("fp16", "bf16") if k in sig})  # keep full-precision training
+    train_args = TrainingArguments(**kw)
+
+    # Likewise, newer Trainer versions take the tokenizer as `processing_class` instead of `tokenizer`.
+    tok_kw = "processing_class" if "processing_class" in signature(Trainer).parameters else "tokenizer"
+    trainer = Trainer(
+        model=mdl,
+        args=train_args,
+        train_dataset=ds_tok["train"],
+        eval_dataset=ds_tok["test"],
+        data_collator=collator,
+        **{tok_kw: tok},
+    )
+
+    trainer.train()
+    mdl.save_pretrained(args.output_dir)
+    print(f"Saved LoRA adapters to {args.output_dir}")
+
+if __name__ == "__main__":  # run the training CLI only when this file is executed as a script
+    main()

From eb702b5026826b017f9b0efde5382a7f2130abb0 Mon Sep 17 00:00:00 2001
From: akshajnad <98061663+akshajnad@users.noreply.github.com>
Date: Wed, 3 Sep 2025 23:30:39 -0400
Subject: [PATCH 4/6] Create requirements.txt

---
 submissions/Akshaj Nadimpalli/requirements.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 submissions/Akshaj Nadimpalli/requirements.txt

diff --git a/submissions/Akshaj Nadimpalli/requirements.txt b/submissions/Akshaj Nadimpalli/requirements.txt
new file mode 100644
index 0000000..8c783b8
--- /dev/null
+++ b/submissions/Akshaj Nadimpalli/requirements.txt	
@@ -0,0 +1,10 @@
+transformers>=4.42.0
+peft>=0.11.0
+accelerate>=0.30.0
+datasets>=2.20.0
+sentencepiece>=0.2.0
+torch>=2.2.0
+numpy>=1.26.0
+beautifulsoup4>=4.12.0
+nltk>=3.8.1
+scikit-learn>=1.4.0

From 70420a879dec35bd9197024cca91f45066dc7b4c Mon Sep 17 00:00:00 2001
From: akshajnad <98061663+akshajnad@users.noreply.github.com>
Date: Wed, 3 Sep 2025 23:30:56 -0400
Subject: [PATCH 5/6] Create sample_sources.txt

---
 .../Akshaj Nadimpalli/sample_sources.txt      | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 submissions/Akshaj Nadimpalli/sample_sources.txt

diff --git a/submissions/Akshaj Nadimpalli/sample_sources.txt b/submissions/Akshaj Nadimpalli/sample_sources.txt
new file mode 100644
index 0000000..1e7988a
--- /dev/null
+++ b/submissions/Akshaj Nadimpalli/sample_sources.txt	
@@ -0,0 +1,51 @@
+https://en.wikipedia.org/wiki/DNA
+https://en.wikipedia.org/wiki/Mitochondrion
+https://en.wikipedia.org/wiki/CRISPR
+https://en.wikipedia.org/wiki/RNA_interference
+https://en.wikipedia.org/wiki/Insulin
+https://en.wikipedia.org/wiki/Photosynthesis
+https://en.wikipedia.org/wiki/Great_Depression
+https://en.wikipedia.org/wiki/New_Deal
+https://en.wikipedia.org/wiki/Marshall_Plan
+https://en.wikipedia.org/wiki/Cuban_Missile_Crisis
+https://en.wikipedia.org/wiki/United_Nations
+https://en.wikipedia.org/wiki/NATO
+https://en.wikipedia.org/wiki/World_Wide_Web
+https://en.wikipedia.org/wiki/Internet
+https://en.wikipedia.org/wiki/Alan_Turing
+https://en.wikipedia.org/wiki/Claude_Shannon
+https://en.wikipedia.org/wiki/Unix
+https://en.wikipedia.org/wiki/Linux
+https://en.wikipedia.org/wiki/Algorithm
+https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm
+https://en.wikipedia.org/wiki/Quicksort
+https://en.wikipedia.org/wiki/Relational_database
+https://en.wikipedia.org/wiki/SQL
+https://en.wikipedia.org/wiki/General_relativity
+https://en.wikipedia.org/wiki/Quantum_mechanics
+https://en.wikipedia.org/wiki/Higgs_boson
+https://en.wikipedia.org/wiki/Periodic_table
+https://en.wikipedia.org/wiki/Haber_process
+https://en.wikipedia.org/wiki/Ammonia
+https://en.wikipedia.org/wiki/Photosystem_II
+https://en.wikipedia.org/wiki/Vaccination
+https://en.wikipedia.org/wiki/Penicillin
+https://en.wikipedia.org/wiki/Human_immunodeficiency_virus
+https://en.wikipedia.org/wiki/Coronavirus
+https://en.wikipedia.org/wiki/Federal_Reserve
+https://en.wikipedia.org/wiki/Inflation
+https://en.wikipedia.org/wiki/Gross_domestic_product
+https://en.wikipedia.org/wiki/Glass%E2%80%93Steagall_Legislation
+https://en.wikipedia.org/wiki/Brown_v._Board_of_Education
+https://en.wikipedia.org/wiki/Miranda_v._Arizona
+https://en.wikipedia.org/wiki/Clean_Air_Act_(United_States)
+https://en.wikipedia.org/wiki/Bill_of_Rights
+https://en.wikipedia.org/wiki/Manhattan_Project
+https://en.wikipedia.org/wiki/Apollo_program
+https://en.wikipedia.org/wiki/International_Space_Station
+https://en.wikipedia.org/wiki/Global_Positioning_System
+https://en.wikipedia.org/wiki/Artificial_intelligence
+https://en.wikipedia.org/wiki/Machine_learning
+https://en.wikipedia.org/wiki/Neural_network
+https://en.wikipedia.org/wiki/Support_vector_machine
+https://en.wikipedia.org/wiki/Backpropagation

From c4944535bc0b192b41622646d3e6c24438f8acf4 Mon Sep 17 00:00:00 2001
From: akshajnad <98061663+akshajnad@users.noreply.github.com>
Date: Wed, 3 Sep 2025 23:31:30 -0400
Subject: [PATCH 6/6] Create README.md

---
 submissions/Akshaj Nadimpalli/README.md | 26 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 submissions/Akshaj Nadimpalli/README.md

diff --git a/submissions/Akshaj Nadimpalli/README.md b/submissions/Akshaj Nadimpalli/README.md
new file mode 100644
index 0000000..7382548
--- /dev/null
+++ b/submissions/Akshaj Nadimpalli/README.md	
@@ -0,0 +1,26 @@
+Project: WhyWrong (Akshaj Nadimpalli)
+Generates deceptive, counterfactual flashcards from any article or local text (currently it changes a specific word of a phrase and provides 3 choices, of which one is true).
+
+0) Setup
+python -m venv venv
+source venv/bin/activate  # (Windows: venv\Scripts\activate)
+pip install -r requirements.txt
+
+1) Build tiny dataset
+python make_synth_data.py --sources sample_sources.txt \
+  --out data/train.jsonl \
+  --k 30 --per_sentence_max 5 --max_ranked 500 --target_total 1000
+
+2) Train LoRA
+python train_lora.py \
+  --train_path data/train.jsonl \
+  --output_dir adapters/whywrong-lora \
+  --epochs 1 \
+  --batch_size 8 \
+  --lr 1e-4 \
+  --lora_r 16 --lora_alpha 32 --lora_dropout 0.05
+
+3) Run generator on a new article (replace the link with any Wikipedia article)
+Teacher (no training, just do this directly after the setup): python app.py --source "https://en.wikipedia.org/wiki/Psychology" \
+  --k 12 --mode teacher --outdir outputs
+Student (with trained model): python app.py --source "https://en.wikipedia.org/wiki/Psychology" --k 10 --mode student --adapters adapters/whywrong-lora --outdir outputs