diff --git a/submissions/Submission: Rohan Konanki/README.md b/submissions/Submission: Rohan Konanki/README.md new file mode 100644 index 0000000..e634063 --- /dev/null +++ b/submissions/Submission: Rohan Konanki/README.md @@ -0,0 +1,10 @@ +This application works as a simple classifier between fake news and real news, trained on a news dataset from Hugging Face (using the DistilBERT base model). +Given a URL input from the user, it parses the contents at the URL and predicts whether the news is real or fake based on the context in the dataset. +The model is not entirely accurate for newer articles, since it lacks the grounding data to determine the truth of the information. +The libraries required are as follows: +- datasets +- transformers +- bs4 (HTML text parser) +- scikit-learn +- pandas +- requests diff --git a/submissions/Submission: Rohan Konanki/Research_Application.ipynb b/submissions/Submission: Rohan Konanki/Research_Application.ipynb new file mode 100644 index 0000000..003a4a5 --- /dev/null +++ b/submissions/Submission: Rohan Konanki/Research_Application.ipynb @@ -0,0 +1,223 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Qe_spw0enTcU", + "outputId": "b037c15f-b656-4b06-a8ce-b641d861a72d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (4.55.4)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from transformers) (3.19.1)\n", + "Requirement already satisfied: 
huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.34.4)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2.0.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (25.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2024.11.6)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from transformers) (2.32.4)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.21.4)\n", + "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.6.2)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.1)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.12/dist-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n", + "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from 
fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.12.15)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.14.1)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.8)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.4.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2025.8.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from 
aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.6.4)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", + "Requirement already satisfied: bs4 in /usr/local/lib/python3.12/dist-packages (0.0.2)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (from bs4) (4.13.5)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->bs4) (2.7)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->bs4) (4.14.1)\n" + ] + } + ], + "source": [ + "!pip install transformers datasets scikit-learn pandas requests\n", + "!pip install beautifulsoup4" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from datasets import load_dataset\n", + "ds = load_dataset(\"GonzaloA/fake_news\", split=\"train\")\n", + "df = ds.to_pandas()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cYbinwQwoNQp", + "outputId": "429ac36d-55cc-4a91-bd61-bdc912537af4" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Repo card metadata block was not found. Setting CardData to empty.\n", + "WARNING:huggingface_hub.repocard:Repo card metadata block was not found. 
Setting CardData to empty.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "df = df.dropna(subset=['text', 'label'])\n", + "X = df['text']\n", + "y = df['label']\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ], + "metadata": { + "id": "4s7DM9wrpI0R" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "tfidf_vectorizer = TfidfVectorizer(max_features=2500, stop_words='english')\n", + "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", + "X_test_tfidf = tfidf_vectorizer.transform(X_test)" + ], + "metadata": { + "id": "326OYLglp20I" + }, + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers import pipeline\n", + "# NOTE(review): this checkpoint is a DistilBERT model fine-tuned on SST-2, a\n", + "# sentiment task; nothing in this notebook fine-tunes it on the news dataset.\n", + "# Treating POSITIVE as real (1) and NEGATIVE as fake (0) is a heuristic, so\n", + "# the metrics below measure that heuristic rather than a trained detector.\n", + "classifier = pipeline(\"text-classification\", model=\"distilbert-base-uncased-finetuned-sst-2-english\", truncation=True)\n", + "# Score a small slice of the held-out split to keep CPU inference quick.\n", + "sample_size = 50\n", + "X_sample = X_test.head(sample_size).tolist()\n", + "y_sample = y_test.head(sample_size).tolist()\n", + "y_pred_raw = classifier(X_sample)\n", + "y_pred = [1 if p['label'] == 'POSITIVE' else 0 for p in y_pred_raw]\n", + "accuracy = accuracy_score(y_sample, y_pred)\n", + "# target_names follow label order (0, 1). Assumes the dataset encodes\n", + "# 0 = fake and 1 = true, which is also what predict_article's\n", + "# POSITIVE -> Real mapping implies -- TODO confirm on the dataset card.\n", + "# labels=[0, 1] keeps the report valid if the sample holds a single class.\n", + "report = classification_report(y_sample, y_pred, labels=[0, 1], target_names=['Fake', 'True'], zero_division=0)\n", + "# Show the metrics; previously they were computed but never displayed.\n", + "print(f'Accuracy on {len(y_sample)} test samples: {accuracy:.3f}')\n", + "print(report)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2KAgTdG7qeI5", + "outputId": "e5e6754e-b4fd-491f-ec6b-1f3b154ee002" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Device set to use cpu\n" + ] + } + ] + },
+ { + "cell_type": "code", + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "def predict_article(url):\n", + "    \"\"\"Fetch the page at `url` and label its text as 'Real' or 'Fake'.\n", + "\n", + "    The page is downloaded with a 10 second timeout, its paragraph text is\n", + "    extracted and passed to the notebook-level `classifier` pipeline.\n", + "\n", + "    Returns 'Real' when the pipeline answers POSITIVE and 'Fake' otherwise.\n", + "    Failures are reported as a string starting with 'Error:' instead of\n", + "    being raised.\n", + "    \"\"\"\n", + "    try:\n", + "        response = requests.get(url, timeout=10)\n", + "        response.raise_for_status()\n", + "        soup = BeautifulSoup(response.text, 'html.parser')\n", + "        # Prefer <p> text so menus and footers do not crowd out the article;\n", + "        # fall back to the whole page when the markup has no paragraphs.\n", + "        paragraphs = [p.get_text(' ', strip=True) for p in soup.find_all('p')]\n", + "        text = ' '.join(t for t in paragraphs if t) or soup.get_text(' ', strip=True)\n", + "        # Collapse runs of whitespace left over from the HTML layout.\n", + "        text = ' '.join(text.split())\n", + "        if not text:\n", + "            return \"Error: No readable text was found at the URL.\"\n", + "        # Let the tokenizer truncate by tokens; slicing to 512 characters\n", + "        # kept far less text than the model can actually read.\n", + "        prediction_raw = classifier(text, truncation=True)\n", + "        prediction = prediction_raw[0]['label']\n", + "        return \"Real\" if prediction == 'POSITIVE' else \"Fake\"\n", + "    except requests.exceptions.RequestException as e:\n", + "        return f\"Error: Could not retrieve article from URL. {e}\"\n", + "    except Exception as e:\n", + "        return f\"Error: An unexpected error occurred. {e}\"" + ], + "metadata": { + "id": "xAT82LcUquTg" + }, + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "url_to_check = input(\"Please enter a URL to check: \").strip()\n", + "prediction = predict_article(url_to_check)\n", + "# predict_article reports failures as 'Error: ...' strings; print those as-is\n", + "# rather than embedding them in the prediction sentence.\n", + "if prediction.startswith(\"Error:\"):\n", + "    print(prediction)\n", + "else:\n", + "    print(f\"The article at {url_to_check} is predicted to be {prediction}.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2fMCoL8xrbIr", + "outputId": "f4316ed6-7878-4805-9f45-3d875d3cd218" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Please enter a URL to check: https://abcnews.go.com/Politics/trump-calls-epstein-files-irrelevant-push-release-gains/story?id=125225706\n", + "The article at https://abcnews.go.com/Politics/trump-calls-epstein-files-irrelevant-push-release-gains/story?id=125225706 is predicted to be Fake.\n" + ] + } + ] + } + ] +} \ No newline at end of file