From 41692eea0ed7fc50a77b2b21a61c6032e5e845f8 Mon Sep 17 00:00:00 2001 From: akshajnad <98061663+akshajnad@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:28:12 -0400 Subject: [PATCH 1/6] Create app.py --- submissions/Akshaj Nadimpalli/app.py | 590 +++++++++++++++++++++++++++ 1 file changed, 590 insertions(+) create mode 100644 submissions/Akshaj Nadimpalli/app.py diff --git a/submissions/Akshaj Nadimpalli/app.py b/submissions/Akshaj Nadimpalli/app.py new file mode 100644 index 0000000..23730ab --- /dev/null +++ b/submissions/Akshaj Nadimpalli/app.py @@ -0,0 +1,590 @@ +import argparse, csv, json, os, random, re, html +from pathlib import Path +from bs4 import BeautifulSoup +import numpy as np +from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM +import json +import urllib.parse + +# ---------- Utilities ---------- + +from urllib.request import Request, urlopen +from urllib.error import HTTPError, URLError + +def _fetch(url, headers): + req = Request(url, headers=headers) + return urlopen(req, timeout=20).read().decode("utf-8", errors="ignore") + +def get_text(src: str) -> str: + + def is_short(s: str, n: int = 800) -> bool: + return len((s or "").strip()) < n + + if src.startswith(("http://", "https://")): + hdrs = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36", + "Accept-Language": "en-US,en;q=0.9", + } + def fetch(u): + return _fetch(u, hdrs) + + # ---------------- WIKIPEDIA ---------------- + if "wikipedia.org/wiki/" in src: + slug = src.rsplit("/", 1)[-1] + title = urllib.parse.unquote(slug).replace("_", " ") + + try: + plain = fetch(f"https://en.wikipedia.org/api/rest_v1/page/plain/{slug}") + plain = re.sub(r'\r\n?', '\n', plain) + cleaned = clean_wiki_text(plain) + if not is_short(cleaned, 1200): + return cleaned + except Exception: + pass + + try: + mob = fetch(f"https://en.wikipedia.org/api/rest_v1/page/mobile-html/{slug}") + text_only = 
re.sub(r"<[^>]+>", " ", mob) + text_only = re.sub(r"\s+", " ", text_only) + cleaned = clean_wiki_text(text_only) + if not is_short(cleaned, 1200): + return cleaned + except Exception: + pass + + try: + api = fetch("https://en.wikipedia.org/w/api.php" + f"?action=query&prop=extracts&explaintext=1&redirects=1&format=json&titles={urllib.parse.quote(title)}") + obj = json.loads(api) + pages = obj.get("query", {}).get("pages", {}) + extract = "" + for _, page in pages.items(): + if "extract" in page and page["extract"]: + extract = page["extract"] + break + cleaned = clean_wiki_text(extract) + if cleaned and not is_short(cleaned, 1200): + return cleaned + except Exception: + pass + + try: + html_doc = fetch(src) + soup = BeautifulSoup(html_doc, "html.parser") + for el in soup.select("sup.reference, span.mw-editsection"): + el.decompose() + root = soup.select_one("div.mw-parser-output") or soup + paras = [] + for node in root.find_all(["p", "h2", "h3", "li"], recursive=True): + txt = node.get_text(" ", strip=True) + if not txt: + continue + if node.name in ("h2", "h3"): + paras.append(f"\n== {txt} ==") + else: + paras.append(txt) + body = "\n".join(paras) + cleaned = clean_wiki_text(body) + if not is_short(cleaned, 1200): + return cleaned + except Exception: + pass + + try: + data = json.loads(fetch(f"https://en.wikipedia.org/api/rest_v1/page/summary/{slug}")) + extract = (data.get("extract") or "").strip() + cleaned = clean_wiki_text(extract) + if cleaned: + return cleaned + except Exception: + pass + + return "" #failed :( + + # --------------- NON-WIKIPEDIA --------------- + try: + html_doc = fetch(src) + except Exception as e: + raise FileNotFoundError(f"Could not fetch URL: {src} ({e})") + + soup = BeautifulSoup(html_doc, "html.parser") + body = " ".join(x.get_text(" ", strip=True) for x in soup.find_all(["p","li","h2","h3"])) + return re.sub(r"\s+", " ", body).strip() + + # --------------- LOCAL FILE ---------------- + p = Path(src) + if p.exists(): + return 
p.read_text(encoding="utf-8", errors="ignore") + raise FileNotFoundError(f"Source not found: {src}") + +def clean_wiki_text(text: str) -> str: + text = text.replace("\xa0", " ") + + text = re.sub( + r'(?:^|\n)==\s*(References|See also|External links|Notes|Further reading)\s*==.*$', + '', + text, + flags=re.I | re.S + ) + + text = re.sub(r'\s*\[(?:\d+|[a-z]|note\s*\d+|citation needed)\]\s*', ' ', text, flags=re.I) + + text = re.sub(r'\((?:citation needed|clarification needed|disputed|page needed|dead link|link rot)\)', ' ', text, flags=re.I) + + text = re.sub(r'\bISBN[:\s][0-9Xx\-–]+\b', ' ', text) + text = re.sub(r'\bDOI:\s*[^\s,;]+', ' ', text, flags=re.I) + text = re.sub(r'\bPMID\s*\d+\b', ' ', text, flags=re.I) + text = re.sub(r'\bRetrieved\s+\d{1,2}\s+\w+\s+\d{4}\b', ' ', text, flags=re.I) + + text = re.sub(r'\bv\s*•\s*t\s*•\s*e\b', ' ', text, flags=re.I) + + text = re.sub(r'[ \t]+', ' ', text) + text = re.sub(r'\n{2,}', '\n', text) + return text.strip() + +def is_content_sentence(s: str) -> bool: + s = s.strip() + + # Must end like a real sentence + if not re.search(r'[.!?]"?$', s): # period/exclaim/question (optional closing quote) + return False + + # Length & words + if not (50 <= len(s) <= 350): + return False + if len(s.split()) < 8: + return False + + if not re.search(r'\b(is|are|was|were|has|have|had|includes?|use[sd]?|named|called|described|developed|introduced|designed|built|created|founded|occurr?ed|took|won|published|works?|served|led|caused)\b', s, flags=re.I): + return False + + bad_patterns = [ + r'\bISBN\b', r'\bDOI\b', r'\bPMID\b', + r'\bRetrieved\b', r'Wayback Machine', r'Archive\.?org', + r'www\.', r'https?://', r'\.com\b', r'\.org\b', r'\.edu\b', + r'\bIn:\s', r'pcs\.c1\.Page', r'\bv\s*•\s*t\s*•\s*e\b', + r'^[\"“][A-Z][^.!?]{0,100}[\"”]$', + r'^[A-Z][A-Za-z0-9 ,–\-:]{0,120}$', + ] + if re.search("|".join(bad_patterns), s, flags=re.I): + return False + + letters = re.findall(r'[A-Za-z]', s) + if letters: + upper_ratio = sum(1 for ch in 
letters if ch.isupper()) / len(letters) + if upper_ratio > 0.35: + return False + + return True + + +def sent_split(text: str): + chunks = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(“"])', text) + chunks = [c.strip() for c in chunks if c and len(c.strip()) >= 10] + + merged = [] + i = 0 + while i < len(chunks): + s = chunks[i] + if re.search(r'\b[A-Z]\.$', s) and i + 1 < len(chunks): + s = (s + " " + chunks[i+1]).strip() + i += 2 + else: + i += 1 + merged.append(s) + + keep = [m for m in merged if is_content_sentence(m)] + + if len(keep) < 10: + keep = [m for m in merged if 40 <= len(m) <= 350 and not re.search(r'(ISBN|DOI|PMID|www\.|https?://)', m, flags=re.I)] + + seen, out = set(), [] + for s in keep: + k = s.lower() + if k not in seen: + seen.add(k); out.append(s) + return out[:400] + +def swap_entity(sentence: str, target: str, replacement: str) -> str: + pattern = re.escape(target) + return re.sub(pattern, replacement, sentence, count=1) + +# ---------- Teacher components ---------- + +_fe = None +_ner = None +_t2t = None + + +def get_teacher_pipes(): + global _fe, _ner, _t2t + if _fe is None: + _fe = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2") + if _ner is None: + _ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple") + if _t2t is None: + _t2t = pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=96) + return _fe, _ner, _t2t + + +def embed_sents(fe, sents): + vecs = [np.array(fe(s, truncation=True)[0]).mean(axis=0) for s in sents] + return np.stack(vecs) + + +def cosine(a, b): + return float((a @ b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) + + +def rank_sentences(sents, fe): + M = embed_sents(fe, sents) + centroid = M.mean(axis=0) + scores = [cosine(v, centroid) for v in M] + idx = np.argsort(scores)[::-1] + return [sents[i] for i in idx] + + +def pick_entity(ner_res): + pref = {"ORG":3,"PERSON":3,"LOC":3,"GPE":3,"EVENT":2,"DATE":2,"WORK_OF_ART":2,"LAW":2} + best = 
None; bestw = -1 + for ent in ner_res: + w = pref.get(ent.get("entity_group"), 1) + word = ent.get("word", "").strip() + if w > bestw and len(word) > 2: + best, bestw = ent, w + return best + +def parse_distractors(raw: str, sentence: str): + parts = re.split(r'[;\n]|(?:,\s(?=[A-Z]))', raw) + alts, seen = [], set() + + for p in parts: + t = p.strip(" -•\t\"'[]()") + if not (2 <= len(t) <= 80): + continue + if re.search(r'(Alt\d|##|pcs\.c1|https?://|www\.)', t, flags=re.I): + continue + if not re.search(r'[A-Za-z]{3,}', t): + continue + if t in sentence: + continue + low = t.lower() + if low in seen: + continue + seen.add(low) + alts.append(t) + + return alts[:4] + +def _norm_ent(s: str) -> str: + s = s.replace("##", "") + s = re.sub(r"\s+", " ", s).strip() + return s + +def _distinct3(a: str, b: str, c: str) -> bool: + aa, bb, cc = a.strip(), b.strip(), c.strip() + if len({aa, bb, cc}) < 3: + return False + return True + +def _bad_text(s: str) -> bool: + if not s or len(s) < 8: + return True + if re.search(r'(Alt\d|##|https?://|www\.)', s, re.I): + return True + letters = re.findall(r'[A-Za-z]', s) + if letters and (sum(ch.isupper() for ch in letters) / len(letters)) > 0.45: + return True + return False + +def display_clean(s: str) -> str: + s = re.sub(r'\s*\[(?:\d+|[a-z]|note\s*\d+|citation needed|self-published source\?)\]\s*', ' ', s, flags=re.I) + s = re.sub(r'\s*\. \.\s*', '. 
', s) + s = re.sub(r'\s+', ' ', s).strip() + s = s.replace('##', '') + return s + +def _distinct3(a: str, b: str, c: str) -> bool: + a, b, c = a.strip(), b.strip(), c.strip() + if not a or not b or not c: + return False + return len({a, b, c}) == 3 + +def _looks_like_title_or_link(s: str) -> bool: + if re.search(r'(?:www\.|https?://|\.com\b|\.org\b|\.edu\b)', s, flags=re.I): + return True + if re.match(r'^[\"“][A-Z][^.!?]{0,60}[\"”]$', s): + return True + letters = re.findall(r'[A-Za-z]', s) + if letters and (sum(1 for ch in letters if ch.isupper()) / len(letters)) > 0.45: + return True + return False + + +def list_entities(sentence: str, ner): + pref = {"ORG":3,"PERSON":3,"LOC":3,"GPE":3,"EVENT":2,"DATE":2,"WORK_OF_ART":1,"LAW":1} + ents = ner(sentence) + seen = set(); out = [] + for e in ents: + t = e.get("entity_group"); w = (e.get("word") or "").strip() + if not t or len(w) < 3: + continue + if w.lower() in seen: + continue + seen.add(w.lower()) + out.append((w, t, pref.get(t, 0))) + out.sort(key=lambda x: x[2], reverse=True) + return [(w,t) for (w,t,_) in out] + +def build_entity_pool(sents, ner): + pool = {} + for s in sents: + for e in ner(s): + g = e.get("entity_group") + w = (e.get("word") or "").strip() + if g and len(w) > 2: + pool.setdefault(g, set()).add(w) + return pool + + +def teacher_make_item(sentence: str, entity_pool=None, forced_entity=None): + fe, ner, t2t = get_teacher_pipes() + + if forced_entity is not None: + ent_text, ent_type = forced_entity + else: + ents = ner(sentence) + anchor = pick_entity(ents) + if not anchor: + return None + ent_text = anchor["word"] + ent_type = anchor.get("entity_group") + + ent_text = _norm_ent(ent_text) + + prompt = ( + f"You are writing exam distractors. Given the sentence:\n" + f"'{sentence}'\n" + f"Propose TWO realistic but incorrect alternatives for the entity '{ent_text}'. " + f"Each must be the same entity TYPE and plausibly confusable in context. " + f"Return as a plain list separated by semicolons." 
+ ) + cand = t2t(prompt)[0]["generated_text"] + alts = parse_distractors(cand, sentence) + + if len(alts) < 2: + cand2 = t2t(prompt + "\nReturn exactly two alternatives in the format: Alt1; Alt2")[0]["generated_text"] + alts = parse_distractors(cand2, sentence) + + if len(alts) < 2 and entity_pool and ent_type in entity_pool: + candidates = [x for x in list(entity_pool[ent_type]) if _norm_ent(x).lower() != ent_text.lower()] + random.shuffle(candidates) + for c in candidates: + c2 = _norm_ent(c) + if c2 not in alts: + alts.append(c2) + if len(alts) >= 2: + break + + alts = [a for a in alts if a][:2] + if len(alts) < 2: + return None + + wrong1 = swap_entity(sentence, ent_text, alts[0]) + wrong2 = swap_entity(sentence, ent_text, alts[1]) + + if wrong1 == sentence or wrong2 == sentence: + return None + + if (_bad_text(wrong1) or _bad_text(wrong2) or + _looks_like_title_or_link(wrong1) or _looks_like_title_or_link(wrong2)): + return None + + correct = sentence + if not _distinct3(correct, wrong1, wrong2): + return None + + correct = display_clean(correct) + wrong1 = display_clean(wrong1) + wrong2 = display_clean(wrong2) + + expl = t2t( + f"Explain concisely (1–2 sentences each) why these are wrong in context:\n" + f"Context: {sentence}\nWrong A: {wrong1}\nWrong B: {wrong2}" + )[0]["generated_text"] + + return { + "context": sentence, + "entity": ent_text, + "correct": correct, + "distractors": [wrong1, wrong2], + "explanations": [expl] + } + + +# ---------- Student (adapter) inference ---------- + +_student_model = None +_student_tok = None + + +def load_student(adapter_dir: str): + global _student_model, _student_tok + if _student_model is None: + base = "google/flan-t5-small" + _student_tok = AutoTokenizer.from_pretrained(base) + _student_model = AutoModelForSeq2SeqLM.from_pretrained(base) + if adapter_dir and Path(adapter_dir).exists(): + from peft import PeftModel + _student_model = PeftModel.from_pretrained(_student_model, adapter_dir) + _student_model.eval() + 
return _student_tok, _student_model + + +def student_generate(adapter_dir: str, sentence: str, entity_pool=None): + tok, mdl = load_student(adapter_dir) + prompt = ( + "Task: Given a factual sentence and a target entity from it, " + "produce a JSON object with keys: correct (string), distractors (array of two strings), " + "explanations (array of 1-2 short sentences). The distractors must be plausible but incorrect and of the same entity type.\n" + f"Sentence: {sentence}\n" + "Output strictly valid JSON." + ) + ids = tok(prompt, return_tensors="pt") + gen = mdl.generate(**ids, max_new_tokens=192) + out = tok.decode(gen[0], skip_special_tokens=True).strip() + try: + obj = json.loads(out) + if not isinstance(obj.get("correct"), str): + raise ValueError + if not isinstance(obj.get("distractors"), list) or len(obj["distractors"]) < 2: + raise ValueError + return { + "context": sentence, + "entity": None, + "correct": obj["correct"], + "distractors": obj["distractors"][:2], + "explanations": obj.get("explanations", []) + } + except Exception: + return teacher_make_item(sentence, entity_pool=entity_pool) + +# ---------- Emitters ---------- + +def write_anki(cards, outdir: Path): + csvp = outdir / "anki_flashcards.csv" + with open(csvp, "w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow(["Front","Back","Extra"]) + for c in cards: + q = "Which statement is correct?" + A = display_clean(c["correct"] or "") + B = display_clean(c["distractors"][0] or "") + C = display_clean(c["distractors"][1] or "") + if not _distinct3(A, B, C): + continue + + correct_slot = random.randint(0, 2) + texts = [None, None, None] + texts[correct_slot] = A + rem = [i for i in (0,1,2) if i != correct_slot] + texts[rem[0]] = B + texts[rem[1]] = C + + opts = [(lab, texts[i], i == correct_slot) for i, lab in enumerate(["A","B","C"])] + front = f"{q}

" + "
".join(f"{lab}) {html.escape(txt)}" for lab, txt, _ in opts) + correct_letter = ["A","B","C"][correct_slot] + exp = display_clean(' '.join(c.get('explanations', []))) + back = f"Correct: {correct_letter}" + (f"

{html.escape(exp)}" if exp else "") + extra = f"Context: {html.escape(display_clean(c['context']))}" + w.writerow([front, back, extra]) + + +def write_quiz(cards, outdir: Path): + htmlp = outdir / "quiz.html" + with open(htmlp, "w", encoding="utf-8") as f: + f.write("WhyWrong Quiz") + f.write("") + f.write("

WhyWrong — Counterfactual Quiz

") + f.write("

Pick the correct statement. Click 'Check' to reveal the answer & explanation.

") + f.write("") + for i, c in enumerate(cards): + q = "Which statement is correct?" + A = display_clean(c["correct"] or "") + B = display_clean(c["distractors"][0] or "") + C = display_clean(c["distractors"][1] or "") + if not _distinct3(A, B, C): + continue + + correct_slot = random.randint(0, 2) + texts = [None, None, None] + texts[correct_slot] = A + rem = [idx for idx in (0,1,2) if idx != correct_slot] + texts[rem[0]] = B + texts[rem[1]] = C + + correct_letter = ["A","B","C"][correct_slot] + + f.write(f"

Q{i+1}. {html.escape(q)}

") + for lab, txt in zip(["A","B","C"], texts): + f.write(f"
") + exp = display_clean(' '.join(c.get('explanations', []))) + f.write(f"

") + f.write(f"
Explanation: {html.escape(exp)}
Context: {html.escape(display_clean(c['context']))}
") + +# ---------- Main ---------- + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--source", required=True, help="URL or path to text") + ap.add_argument("--k", type=int, default=10, help="number of cards to PRODUCE") + ap.add_argument("--mode", choices=["teacher","student"], default="teacher") + ap.add_argument("--adapters", default=None, help="path to LoRA adapters (for student mode)") + ap.add_argument("--outdir", default="outputs") + args = ap.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + + text = get_text(args.source) + sents = sent_split(text) + print(f"[debug] sentences extracted = {len(sents)}; target cards k = {args.k}") + + cards = [] + + fe, ner, _ = get_teacher_pipes() + + ranked = rank_sentences(sents, fe) + + pool = build_entity_pool(ranked[:120], ner) + + made = 0 + if args.mode == "teacher": + for s in ranked: + item = teacher_make_item(s, entity_pool=pool) + if item: + cards.append(item) + made += 1 + if made >= args.k: + break + else: + for s in ranked: + item = student_generate(args.adapters, s, entity_pool=pool) + if item: + cards.append(item) + made += 1 + if made >= args.k: + break + + if not cards: + print(f"[debug] sentences extracted = {len(sents)}; trying to make k={args.k}") + raise SystemExit("No cards produced. 
Try a different source or increase --k.") + + outdir = Path(args.outdir) + write_anki(cards, outdir) + write_quiz(cards, outdir) + (outdir/"report.json").write_text(json.dumps(cards, indent=2), encoding="utf-8") + print(f"Wrote: {outdir/'anki_flashcards.csv'}, {outdir/'quiz.html'}, {outdir/'report.json'}") + + +if __name__ == "__main__": + main() From 8de34d1c6c1a8ee3347612914a71a4c9333fc2f0 Mon Sep 17 00:00:00 2001 From: akshajnad <98061663+akshajnad@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:29:03 -0400 Subject: [PATCH 2/6] Create make_synth_data.py --- .../Akshaj Nadimpalli/make_synth_data.py | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 submissions/Akshaj Nadimpalli/make_synth_data.py diff --git a/submissions/Akshaj Nadimpalli/make_synth_data.py b/submissions/Akshaj Nadimpalli/make_synth_data.py new file mode 100644 index 0000000..6e58301 --- /dev/null +++ b/submissions/Akshaj Nadimpalli/make_synth_data.py @@ -0,0 +1,143 @@ +import argparse, json, os, random, hashlib +from pathlib import Path + +from app import ( + get_text, sent_split, get_teacher_pipes, rank_sentences, + teacher_make_item, list_entities, build_entity_pool, swap_entity +) + +random.seed(42) + +def norm_key(sentence: str, forced_entity): + ent_txt, ent_typ = (forced_entity or ("", "")) + raw = f"{sentence.strip()}||{(ent_txt or '').strip()}||{(ent_typ or '').strip()}" + return hashlib.md5(raw.encode("utf-8")).hexdigest() + +def fabricate_item(sentence: str, ent_text: str, ent_type: str, pool_by_type: dict): + choices = [x for x in list(pool_by_type.get(ent_type, [])) if x.lower() != (ent_text or "").lower()] + random.shuffle(choices) + if len(choices) < 2: + return None + + wrong1 = swap_entity(sentence, ent_text, choices[0]) + wrong2 = swap_entity(sentence, ent_text, choices[1]) + + expl = ( + f"'{choices[0]}' and '{choices[1]}' are {ent_type} entities that do not match the original fact in context. 
" + f"The correct statement includes '{ent_text}' instead." + ) + + return { + "context": sentence, + "entity": ent_text, + "correct": sentence, + "distractors": [wrong1, wrong2], + "explanations": [expl] + } + +def build_example(sentence: str, entity_pool, forced_entity=None): + item = teacher_make_item(sentence, entity_pool=entity_pool, forced_entity=forced_entity) + if item: + inst = ( + "Task: Given a factual sentence, produce a JSON object with keys: " + "correct (string), distractors (array of two strings), explanations (array with 1-2 sentences). " + "The distractors must be plausible but incorrect and of the same entity type.\n" + f"Sentence: {item['context']}\nOutput strictly valid JSON." + ) + tgt = json.dumps({ + "correct": item["correct"], + "distractors": item["distractors"][:2], + "explanations": item.get("explanations", []) + }, ensure_ascii=False) + return {"input": inst, "target": tgt} + + if forced_entity is not None: + ent_text, ent_type = forced_entity + else: + ent_text, ent_type = None, None + + if ent_text and ent_type: + item2 = fabricate_item(sentence, ent_text, ent_type, entity_pool) + if item2: + inst = ( + "Task: Given a factual sentence, produce a JSON object with keys: " + "correct (string), distractors (array of two strings), explanations (array with 1-2 sentences). " + "The distractors must be plausible but incorrect and of the same entity type.\n" + f"Sentence: {item2['context']}\nOutput strictly valid JSON." 
+ ) + tgt = json.dumps({ + "correct": item2["correct"], + "distractors": item2["distractors"][:2], + "explanations": item2.get("explanations", []) + }, ensure_ascii=False) + return {"input": inst, "target": tgt} + + return None + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--sources", required=True, help="text file with one URL or local path per line") + ap.add_argument("--out", default="data/train.jsonl") + ap.add_argument("--k", type=int, default=30, help="TARGET items to MAKE per source") + ap.add_argument("--per_sentence_max", type=int, default=5, help="max entities to try per sentence") + ap.add_argument("--max_ranked", type=int, default=400, help="how many ranked sents to consider per source") + ap.add_argument("--target_total", type=int, default=1000, help="stop when total items reach this number") + args = ap.parse_args() + + srcs = [x.strip() for x in Path(args.sources).read_text(encoding="utf-8").splitlines() if x.strip()] + random.shuffle(srcs) + + os.makedirs(Path(args.out).parent, exist_ok=True) + + fe, ner, _ = get_teacher_pipes() + n_ok = 0 + seen_keys = set() + + with open(args.out, "w", encoding="utf-8") as f: + for si, src in enumerate(srcs, 1): + if n_ok >= args.target_total: + break + try: + text = get_text(src) + sents = sent_split(text) + if not sents: + print(f"[warn] no usable sentences in {src}; skipping") + continue + + ranked = rank_sentences(sents, fe) + pool = build_entity_pool(ranked[:min(len(ranked), args.max_ranked)], ner) + + made_here = 0 + tried_sentences = 0 + + for s in ranked[:args.max_ranked]: + if made_here >= args.k or n_ok >= args.target_total: + break + ents = list_entities(s, ner)[:args.per_sentence_max] + tried_sentences += 1 + if not ents: + continue + + for ent in ents: + if made_here >= args.k or n_ok >= args.target_total: + break + key = norm_key(s, ent) + if key in seen_keys: + continue + + ex = build_example(s, pool, forced_entity=ent) + if ex: + f.write(json.dumps(ex, ensure_ascii=False) + 
"\n") + seen_keys.add(key) + n_ok += 1 + made_here += 1 + + print(f"[info] [{si}/{len(srcs)}] {src}: made {made_here}/{args.k} (tried {tried_sentences} sents; total={n_ok})") + + except Exception as e: + print(f"[warn] skipping {src}: {e}") + + print(f"Wrote {n_ok} examples to {args.out}") + +if __name__ == "__main__": + main() From 8c5d1f8e78ecb9dd289889dbbcd5a810b4edd4f6 Mon Sep 17 00:00:00 2001 From: akshajnad <98061663+akshajnad@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:30:24 -0400 Subject: [PATCH 3/6] Create train_lora.py --- submissions/Akshaj Nadimpalli/train_lora.py | 85 +++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 submissions/Akshaj Nadimpalli/train_lora.py diff --git a/submissions/Akshaj Nadimpalli/train_lora.py b/submissions/Akshaj Nadimpalli/train_lora.py new file mode 100644 index 0000000..42ddd91 --- /dev/null +++ b/submissions/Akshaj Nadimpalli/train_lora.py @@ -0,0 +1,85 @@ +import argparse, json +from pathlib import Path +from datasets import load_dataset +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments +from peft import get_peft_model, LoraConfig, TaskType +from inspect import signature + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--train_path", required=True) + ap.add_argument("--output_dir", default="adapters/whywrong-lora") + ap.add_argument("--epochs", type=int, default=1) + ap.add_argument("--batch_size", type=int, default=8) + ap.add_argument("--lr", type=float, default=1e-4) + ap.add_argument("--lora_r", type=int, default=16) + ap.add_argument("--lora_alpha", type=int, default=32) + ap.add_argument("--lora_dropout", type=float, default=0.05) + ap.add_argument("--max_source_len", type=int, default=768) + ap.add_argument("--max_target_len", type=int, default=256) + args = ap.parse_args() + + base = "google/flan-t5-small" + tok = AutoTokenizer.from_pretrained(base) + mdl = AutoModelForSeq2SeqLM.from_pretrained(base) + + 
ds = load_dataset("json", data_files=args.train_path, split="train") + ds = ds.train_test_split(test_size=0.1, seed=42) + + def tok_fn(batch): + model_in = tok(batch["input"], truncation=True, max_length=args.max_source_len) + labels = tok(text_target=batch["target"], truncation=True, max_length=args.max_target_len) + model_in["labels"] = labels["input_ids"] + return model_in + + ds_tok = ds.map(tok_fn, batched=True, remove_columns=ds["train"].column_names) + + peft_cfg = LoraConfig( + task_type=TaskType.SEQ_2_SEQ_LM, + r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + target_modules=["q","v","k","o"] # common for T5 + ) + mdl = get_peft_model(mdl, peft_cfg) + + collator = DataCollatorForSeq2Seq(tok, model=mdl) + + kw = dict( + output_dir=args.output_dir, + per_device_train_batch_size=args.batch_size, + per_device_eval_batch_size=args.batch_size, + learning_rate=args.lr, + num_train_epochs=args.epochs, + logging_steps=50, + remove_unused_columns=True, + report_to=[], + ) + + sig = signature(TrainingArguments) + if "evaluation_strategy" in sig.parameters: + kw["evaluation_strategy"] = "epoch" + if "save_strategy" in sig.parameters: + kw["save_strategy"] = "epoch" + if "fp16" in sig.parameters: + kw["fp16"] = False + if "bf16" in sig.parameters: + kw["bf16"] = False + + train_args = TrainingArguments(**kw) + + trainer = Trainer( + model=mdl, + args=train_args, + train_dataset=ds_tok["train"], + eval_dataset=ds_tok["test"], + data_collator=collator, + tokenizer=tok, + ) + + trainer.train() + mdl.save_pretrained(args.output_dir) + print(f"Saved LoRA adapters to {args.output_dir}") + +if __name__ == "__main__": + main() From eb702b5026826b017f9b0efde5382a7f2130abb0 Mon Sep 17 00:00:00 2001 From: akshajnad <98061663+akshajnad@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:30:39 -0400 Subject: [PATCH 4/6] Create requirements.txt --- submissions/Akshaj Nadimpalli/requirements.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) create 
mode 100644 submissions/Akshaj Nadimpalli/requirements.txt diff --git a/submissions/Akshaj Nadimpalli/requirements.txt b/submissions/Akshaj Nadimpalli/requirements.txt new file mode 100644 index 0000000..8c783b8 --- /dev/null +++ b/submissions/Akshaj Nadimpalli/requirements.txt @@ -0,0 +1,10 @@ +transformers>=4.42.0 +peft>=0.11.0 +accelerate>=0.30.0 +datasets>=2.20.0 +sentencepiece>=0.2.0 +torch>=2.2.0 +numpy>=1.26.0 +beautifulsoup4>=4.12.0 +nltk>=3.8.1 +scikit-learn>=1.4.0 From 70420a879dec35bd9197024cca91f45066dc7b4c Mon Sep 17 00:00:00 2001 From: akshajnad <98061663+akshajnad@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:30:56 -0400 Subject: [PATCH 5/6] Create sample_sources.txt --- .../Akshaj Nadimpalli/sample_sources.txt | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 submissions/Akshaj Nadimpalli/sample_sources.txt diff --git a/submissions/Akshaj Nadimpalli/sample_sources.txt b/submissions/Akshaj Nadimpalli/sample_sources.txt new file mode 100644 index 0000000..1e7988a --- /dev/null +++ b/submissions/Akshaj Nadimpalli/sample_sources.txt @@ -0,0 +1,51 @@ +https://en.wikipedia.org/wiki/DNA +https://en.wikipedia.org/wiki/Mitochondrion +https://en.wikipedia.org/wiki/CRISPR +https://en.wikipedia.org/wiki/RNA_interference +https://en.wikipedia.org/wiki/Insulin +https://en.wikipedia.org/wiki/Photosynthesis +https://en.wikipedia.org/wiki/Great_Depression +https://en.wikipedia.org/wiki/New_Deal +https://en.wikipedia.org/wiki/Marshall_Plan +https://en.wikipedia.org/wiki/Cuban_Missile_Crisis +https://en.wikipedia.org/wiki/United_Nations +https://en.wikipedia.org/wiki/NATO +https://en.wikipedia.org/wiki/World_Wide_Web +https://en.wikipedia.org/wiki/Internet +https://en.wikipedia.org/wiki/Alan_Turing +https://en.wikipedia.org/wiki/Claude_Shannon +https://en.wikipedia.org/wiki/Unix +https://en.wikipedia.org/wiki/Linux +https://en.wikipedia.org/wiki/Algorithm +https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm 
+https://en.wikipedia.org/wiki/Quicksort +https://en.wikipedia.org/wiki/Relational_database +https://en.wikipedia.org/wiki/SQL +https://en.wikipedia.org/wiki/General_relativity +https://en.wikipedia.org/wiki/Quantum_mechanics +https://en.wikipedia.org/wiki/Higgs_boson +https://en.wikipedia.org/wiki/Periodic_table +https://en.wikipedia.org/wiki/Haber_process +https://en.wikipedia.org/wiki/Ammonia +https://en.wikipedia.org/wiki/Photosystem_II +https://en.wikipedia.org/wiki/Vaccination +https://en.wikipedia.org/wiki/Penicillin +https://en.wikipedia.org/wiki/Human_immunodeficiency_virus +https://en.wikipedia.org/wiki/Coronavirus +https://en.wikipedia.org/wiki/Federal_Reserve +https://en.wikipedia.org/wiki/Inflation +https://en.wikipedia.org/wiki/Gross_domestic_product +https://en.wikipedia.org/wiki/Glass%E2%80%93Steagall_Legislation +https://en.wikipedia.org/wiki/Brown_v._Board_of_Education +https://en.wikipedia.org/wiki/Miranda_v._Arizona +https://en.wikipedia.org/wiki/Clean_Air_Act_(United_States) +https://en.wikipedia.org/wiki/Bill_of_Rights +https://en.wikipedia.org/wiki/Manhattan_Project +https://en.wikipedia.org/wiki/Apollo_program +https://en.wikipedia.org/wiki/International_Space_Station +https://en.wikipedia.org/wiki/Global_Positioning_System +https://en.wikipedia.org/wiki/Artificial_intelligence +https://en.wikipedia.org/wiki/Machine_learning +https://en.wikipedia.org/wiki/Neural_network +https://en.wikipedia.org/wiki/Support_vector_machine +https://en.wikipedia.org/wiki/Backpropagation From c4944535bc0b192b41622646d3e6c24438f8acf4 Mon Sep 17 00:00:00 2001 From: akshajnad <98061663+akshajnad@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:31:30 -0400 Subject: [PATCH 6/6] Create README.md --- submissions/Akshaj Nadimpalli/README.md | 26 +++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 submissions/Akshaj Nadimpalli/README.md diff --git a/submissions/Akshaj Nadimpalli/README.md b/submissions/Akshaj Nadimpalli/README.md new file 
mode 100644 index 0000000..7382548 --- /dev/null +++ b/submissions/Akshaj Nadimpalli/README.md @@ -0,0 +1,26 @@ +Project: WhyWrong (Akshaj Nadimpalli) +Generates deceptive, counterfactual flashcards from any article or local text. (Currently, it changes a specific word of a sentence and provides 3 choices, of which exactly one is true.) + +0) Setup +python -m venv venv +source venv/bin/activate # (Windows: venv\Scripts\activate) +pip install -r requirements.txt + +1) Build a tiny dataset +python make_synth_data.py --sources sample_sources.txt \ + --out data/train.jsonl \ + --k 30 --per_sentence_max 5 --max_ranked 500 --target_total 1000 + +2) Train LoRA +python train_lora.py \ + --train_path data/train.jsonl \ + --output_dir adapters/whywrong-lora \ + --epochs 1 \ + --batch_size 8 \ + --lr 1e-4 \ + --lora_r 16 --lora_alpha 32 --lora_dropout 0.05 + +3) Run the generator on a new article (replace the link with any Wikipedia article) +Teacher (no training required; run this directly after the setup): python app.py --source "https://en.wikipedia.org/wiki/Psychology" \ + --k 12 --mode teacher --outdir outputs +Student (with trained model): python app.py --source "https://en.wikipedia.org/wiki/Psychology" --k 10 --mode student --adapters adapters/whywrong-lora --outdir outputs