Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions submissions/Submission: Rohan Konanki/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
This application works as a simple classifier between fake news and real news, trained on a news dataset from Hugging Face (using the DistilBERT base model).
Given a URL input from the user, it parses the contents at the URL and predicts whether the news is real or fake based on the context in the dataset.
The model is not entirely accurate for newer articles, since it lacks the grounding data to determine the truth of the information.
The libraries required are as follows:
- datasets
- transformers
- bs4 (BeautifulSoup, HTML text parser)
- scikit-learn
- pandas
- requests
223 changes: 223 additions & 0 deletions submissions/Submission: Rohan Konanki/Research_Application.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Qe_spw0enTcU",
"outputId": "b037c15f-b656-4b06-a8ce-b641d861a72d"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (4.55.4)\n",
"Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from transformers) (3.19.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.34.4)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2.0.2)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (25.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2024.11.6)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from transformers) (2.32.4)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.21.4)\n",
"Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.6.2)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.1)\n",
"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n",
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.12/dist-packages (from datasets) (3.5.0)\n",
"Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n",
"Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n",
"Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.12.15)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.14.1)\n",
"Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.8)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.4.3)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2.5.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2025.8.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n",
"Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.6.4)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n",
"Requirement already satisfied: bs4 in /usr/local/lib/python3.12/dist-packages (0.0.2)\n",
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (from bs4) (4.13.5)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->bs4) (2.7)\n",
"Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->bs4) (4.14.1)\n"
]
}
],
"source": [
"# Use the %pip magic (not !pip) so packages are installed into the environment of the\n",
"# running kernel; -q keeps the notebook output free of resolver logs.\n",
"%pip install -q transformers datasets\n",
"%pip install -q bs4"
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from datasets import load_dataset\n",
"\n",
"# Fetch the training split of the GonzaloA/fake_news dataset from the Hugging Face Hub\n",
"# and materialize it as a pandas DataFrame.\n",
"ds = load_dataset(path=\"GonzaloA/fake_news\", split=\"train\")\n",
"df = ds.to_pandas()\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cYbinwQwoNQp",
"outputId": "429ac36d-55cc-4a91-bd61-bdc912537af4"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Repo card metadata block was not found. Setting CardData to empty.\n",
"WARNING:huggingface_hub.repocard:Repo card metadata block was not found. Setting CardData to empty.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Keep only rows that have both an article body and a label.\n",
"df = df.dropna(subset=['text', 'label'])\n",
"X, y = df['text'], df['label']\n",
"\n",
"# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")"
],
"metadata": {
"id": "4s7DM9wrpI0R"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Build TF-IDF features: fit the vocabulary and IDF weights on the training split only,\n",
"# then apply that fitted transform to the test split.\n",
"# NOTE(review): no later cell in this notebook appears to use these matrices -- confirm\n",
"# whether they are still needed.\n",
"tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=2500)\n",
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
"X_test_tfidf = tfidf_vectorizer.transform(X_test)"
],
"metadata": {
"id": "326OYLglp20I"
},
"execution_count": 20,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from transformers import pipeline\n",
"\n",
"# NOTE(review): this checkpoint is DistilBERT fine-tuned on SST-2 (sentiment analysis).\n",
"# It emits POSITIVE/NEGATIVE sentiment labels and is not fine-tuned on the fake-news data,\n",
"# so the metrics below measure how sentiment happens to line up with the dataset labels.\n",
"classifier = pipeline(\"text-classification\", model=\"distilbert-base-uncased-finetuned-sst-2-english\", truncation=True)\n",
"\n",
"# Score only a small slice of the held-out split to keep CPU inference fast.\n",
"sample_size = 50\n",
"X_sample = X_test.head(sample_size).tolist()\n",
"y_sample = y_test.head(sample_size).tolist()\n",
"y_pred_raw = classifier(X_sample)\n",
"\n",
"# POSITIVE -> 1 (real), anything else -> 0 (fake), the same mapping predict_article uses.\n",
"# Assumes the dataset encodes 0 = fake and 1 = real -- TODO confirm against the dataset card.\n",
"y_pred = [1 if p['label'] == 'POSITIVE' else 0 for p in y_pred_raw]\n",
"accuracy = accuracy_score(y_sample, y_pred)\n",
"\n",
"# target_names follows the label order [0, 1]. Passing labels explicitly keeps the report\n",
"# from raising when the sample happens to contain only one class.\n",
"report = classification_report(y_sample, y_pred, labels=[0, 1], target_names=['Fake', 'True'], zero_division=0)\n",
"\n",
"# Display the metrics so the evaluation is visible in the notebook output.\n",
"print(f'Accuracy on {sample_size} held-out articles: {accuracy:.3f}')\n",
"print(report)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2KAgTdG7qeI5",
"outputId": "e5e6754e-b4fd-491f-ec6b-1f3b154ee002"
},
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Device set to use cpu\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"def predict_article(url):\n",
"    \"\"\"Download the page at ``url`` and classify its visible text.\n",
"\n",
"    Args:\n",
"        url: Address of the article to fetch.\n",
"\n",
"    Returns:\n",
"        'Real' when the classifier's top label is 'POSITIVE', 'Fake' for any\n",
"        other label, or a message starting with 'Error:' when the page cannot\n",
"        be retrieved or processed.\n",
"    \"\"\"\n",
"    try:\n",
"        response = requests.get(url, timeout=10)\n",
"        response.raise_for_status()\n",
"        soup = BeautifulSoup(response.text, 'html.parser')\n",
"        # Remove non-visible markup so scripts and styles are not scored as article text.\n",
"        for tag in soup.find_all(['script', 'style', 'noscript']):\n",
"            tag.decompose()\n",
"        # Prefer paragraph text, which usually holds the article body rather than page chrome;\n",
"        # fall back to all visible text when the page has no <p> content.\n",
"        paragraphs = [p.get_text(' ', strip=True) for p in soup.find_all('p')]\n",
"        text = ' '.join(part for part in paragraphs if part) or soup.get_text(' ', strip=True)\n",
"        # Collapse whitespace runs so the model input is not dominated by blank lines.\n",
"        text = ' '.join(text.split())\n",
"        if not text:\n",
"            return \"Error: No readable text found at the URL.\"\n",
"        # Truncate by tokens (the model's actual limit) rather than by characters, so the\n",
"        # classifier sees as much of the article as its input window allows.\n",
"        prediction_raw = classifier(text, truncation=True)\n",
"        prediction = prediction_raw[0]['label']\n",
"        return \"Real\" if prediction == 'POSITIVE' else \"Fake\"\n",
"    except requests.exceptions.RequestException as e:\n",
"        return f\"Error: Could not retrieve article from URL. {e}\"\n",
"    except Exception as e:\n",
"        return f\"Error: An unexpected error occurred. {e}\""
],
"metadata": {
"id": "xAT82LcUquTg"
},
"execution_count": 24,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Strip stray whitespace that commonly comes along with a pasted URL.\n",
"url_to_check = input(\"Please enter a URL to check: \").strip()\n",
"prediction = predict_article(url_to_check)\n",
"\n",
"# predict_article reports failures as strings starting with 'Error:'; show those as-is\n",
"# instead of presenting them as a Real/Fake verdict.\n",
"if prediction.startswith(\"Error:\"):\n",
"    print(prediction)\n",
"else:\n",
"    print(f\"The article at {url_to_check} is predicted to be {prediction}.\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2fMCoL8xrbIr",
"outputId": "f4316ed6-7878-4805-9f45-3d875d3cd218"
},
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Please enter a URL to check: https://abcnews.go.com/Politics/trump-calls-epstein-files-irrelevant-push-release-gains/story?id=125225706\n",
"The article at https://abcnews.go.com/Politics/trump-calls-epstein-files-irrelevant-push-release-gains/story?id=125225706 is predicted to be Fake.\n"
]
}
]
}
]
}