gpsaggese · vaibhavdevarapalli · Apr 1, 2026 · May 6, 2026 · May 6, 2026 · May 7, 2026
diff --git a/...Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/.gitignore b/...Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/.gitignore
@@ -0,0 +1 @@
+fasttext_best_model.bin
diff --git a/...Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile b/...Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile
@@ -0,0 +1,46 @@
+# Development image for the FastText text-classification project:
+# Python 3.12 with JupyterLab and the packages from requirements.txt.
+#
+# Use Python 3.12 slim (already has Python and pip).
+FROM python:3.12-slim
+
+# Avoid interactive prompts during apt operations.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Run RUN steps under bash with pipefail so that a failing command on the
+# left-hand side of a pipe (e.g. `cmd | tee log`) fails the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install CA certificates (needed for HTTPS).
+# --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install project specific packages.
+# requirements.txt is copied on its own so that this layer is rebuilt only
+# when the dependency list changes.
+RUN mkdir -p /install
+COPY requirements.txt /install/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt
+
+# Config.
+# etc_sudoers is kept under /install and also installed as /etc/sudoers;
+# bashrc becomes the root user's shell configuration.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions.
+# The report is printed to the build log and saved to version.log in the
+# working directory. With pipefail, a non-zero exit from version.sh fails the build.
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+# EXPOSE only documents the port; publish it with `docker run -p 8888:8888`.
+EXPOSE 8888
+
+# Start an interactive shell by default.
+CMD ["/bin/bash"]
diff --git a/...rojects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.python_slim b/...rojects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.python_slim
@@ -0,0 +1,43 @@
+# Python-slim variant of the project image:
+# Python 3.12 with JupyterLab and the packages from requirements.txt.
+#
+# Use Python 3.12 slim (already has Python and pip).
+FROM python:3.12-slim
+
+# Avoid interactive prompts during apt operations.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Run RUN steps under bash with pipefail so that a failing command on the
+# left-hand side of a pipe (e.g. `cmd | tee log`) fails the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install CA certificates (needed for HTTPS).
+# --no-install-recommends keeps optional extras out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install project specific packages.
+# requirements.txt is copied on its own so that this layer is rebuilt only
+# when the dependency list changes.
+RUN mkdir -p /install
+COPY requirements.txt /install/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt
+
+# Config.
+# etc_sudoers is kept under /install and also installed as /etc/sudoers;
+# bashrc becomes the root user's shell configuration.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions.
+# The report is printed to the build log and saved to version.log in the
+# working directory. With pipefail, a non-zero exit from version.sh fails the build.
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+# EXPOSE only documents the port; publish it with `docker run -p 8888:8888`.
+EXPOSE 8888
diff --git a/...026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.ubuntu b/...026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.ubuntu
@@ -0,0 +1,55 @@
+# Ubuntu 24.04 variant of the project image: system Python with a virtual
+# environment in /opt/venv holding JupyterLab and the requirements.txt packages.
+FROM ubuntu:24.04
+
+# Avoid interactive prompts during apt operations.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Run RUN steps under bash with pipefail so that a failing command on the
+# left-hand side of a pipe (e.g. `cmd | tee log`) fails the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install system utilities and Python in a single layer.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      curl \
+      git \
+      python3 \
+      python3-dev \
+      python3-pip \
+      python3-venv \
+      sudo \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create virtual environment.
+RUN python3 -m venv /opt/venv
+
+# Make the venv the default Python.
+# Putting its bin directory first on PATH makes `python` and `pip` below
+# resolve to the venv rather than to the system interpreter.
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install project specific packages.
+RUN mkdir /install
+COPY requirements.txt /install/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt
+
+# Config.
+# etc_sudoers is kept under /install and also installed as /etc/sudoers;
+# bashrc becomes the root user's shell configuration.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions.
+# The report is printed to the build log and saved to version.log in the
+# working directory. With pipefail, a non-zero exit from version.sh fails the build.
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+# EXPOSE only documents the port; publish it with `docker run -p 8888:8888`.
+EXPOSE 8888
diff --git a/...ing2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.uv b/...ing2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.uv
@@ -0,0 +1,72 @@
+# uv-based variant of the project image: Ubuntu 24.04 with dependencies
+# installed by uv from pyproject.toml / uv.lock into /app/.venv.
+FROM ubuntu:24.04
+
+# Avoid interactive prompts during apt operations.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Run RUN steps under bash with pipefail so that a failing command on the
+# left-hand side of a pipe (e.g. `curl ... | sh`) fails the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install system utilities and Python in a single layer.
+# ca-certificates is listed explicitly because the uv installer below is
+# downloaded over HTTPS.
+RUN apt-get update && \
+  apt-get upgrade -y && \
+  apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    g++ \
+    git \
+    libgomp1 \
+    python3 \
+    python3-dev \
+    python3-pip \
+    python3-venv \
+    sudo \
+  && rm -rf /var/lib/apt/lists/*
+
+# Install uv for package management.
+# NOTE(review): the installer script is unpinned and unverified; consider
+# pinning a specific uv version for reproducible builds.
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Install project specific packages using uv.
+# Only the manifest and lock file are copied at this point so that this
+# layer is reused until one of them changes.
+COPY pyproject.toml uv.lock /app/
+WORKDIR /app
+RUN uv sync
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Install Jupyter.
+# Install with `uv pip` into the project venv: environments created by uv do
+# not include pip by default, so a bare `pip` would resolve to the system pip,
+# which Ubuntu 24.04 marks as externally managed.
+RUN uv pip install --python /app/.venv/bin/python --no-cache \
+      jupyterlab jupyterlab_vim jupytext
+
+# Copy project files.
+COPY . /app
+
+RUN mkdir /install
+
+# Config.
+# etc_sudoers is kept under /install and also installed as /etc/sudoers;
+# bashrc becomes the root user's shell configuration.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions.
+# The report is printed to the build log and saved to version.log in the
+# working directory (/app). With pipefail, a non-zero exit from version.sh fails the build.
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+# EXPOSE only documents the port; publish it with `docker run -p 8888:8888`.
+EXPOSE 8888
diff --git a/...6/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/README.md b/...6/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/README.md
@@ -0,0 +1,138 @@
+# FastText Text Classification
+
+## What is FastText?
+
+- FastText is an open-source library developed by Facebook AI Research for
+  efficient text classification and word-representation learning.
+- It allows users to create word embeddings and perform supervised learning
+  tasks such as text classification with high accuracy and speed.
+- FastText can handle large datasets and provides pre-trained models for various
+  languages, making it accessible for multilingual applications.
+- The tool supports subword information, which allows it to generate embeddings
+  for out-of-vocabulary words, improving its robustness in natural language
+  processing tasks.
+- FastText is designed to be easy to use, with a command-line interface and
+  Python bindings, making it suitable for both beginners and advanced users.
+
+
+## Project Overview
+
+This project demonstrates text classification using FastText, an open-source
+library developed by Facebook AI Research. We train a model to classify
+newsgroup posts from the 20 Newsgroups dataset into 20 categories, covering
+topics such as politics, religion, sports, science, and technology.
+
+
+## Dataset
+
+We use the 20 Newsgroups dataset, which contains approximately 18,000 newsgroup
+posts across 20 categories. The dataset is loaded directly via scikit-learn and
+requires no manual download.
+
+- Training samples: 11,314
+- Test samples: 7,532
+- Categories: 20
+
+Headers, footers, and quoted text are removed to make the classification task
+more realistic and challenging.
+
+## Project Structure
+
+    UmdTask458_DATA605_Spring2026_FastText_text_classification/
+    Dockerfile                  - Docker environment setup
+    requirements.txt            - Python dependencies
+    fasttext_utils.py           - Utility functions for training and evaluation
+    fasttext.example.ipynb      - Full project walkthrough notebook
+    fasttext.API.ipynb          - FastText API usage examples
+    confusion_matrix_baseline.png
+    confusion_matrix_best.png
+    hyperparameter_tuning.png
+    model_comparison.png
+    error_analysis.png
+    README.md
+
+## Setup and Installation
+
+### Prerequisites
+- Docker Desktop installed and running
+- Git
+
+Note: `docker_build.sh` has Windows path issues; use `docker build` directly instead.
+FastText is incompatible with NumPy 2.0, so `numpy<2.0` is pinned in `requirements.txt`.
+
+### Build the Docker Container
+
+    docker build -t gpsaggese/umd_data605_fasttext .
+
+### Run the Container
+
+    docker run -it --rm -p 8888:8888 -v "$(pwd):/home/user" gpsaggese/umd_data605_fasttext bash
+
+### Start Jupyter
+
+    pip install "numpy<2.0" -q && jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --notebook-dir=/home/user
+
+Then open the token URL shown in the terminal in your browser.
+
+## Notebooks
+
+### fasttext.example.ipynb
+The main project notebook covering:
+1. Dataset loading and exploration
+2. Text preprocessing for FastText format
+3. Baseline model training and evaluation
+4. Hyperparameter tuning across 5 configurations
+5. Best model evaluation with confusion matrix and error analysis
+6. Comparison with Logistic Regression + TF-IDF
+
+### fasttext.API.ipynb
+Demonstrates the FastText API including:
+- Training a model from scratch
+- Making single and top-k predictions
+- Evaluating model performance
+- Saving and loading models
+- Accessing word vectors and nearest neighbors
+
+## FastText API Reference
+
+The core FastText API used in this project:
+
+- `fasttext.train_supervised()` — trains a supervised classification model
+- `model.predict()` — predicts the category of a text input
+- `model.test()` — evaluates model performance on a labeled dataset
+- `model.save_model()` / `fasttext.load_model()` — saves and loads a trained model
+- `model.get_word_vector()` — returns the vector representation of a word
+- `model.get_nearest_neighbors()` — returns words most similar to a given word
+
+## Results
+
+| Model | F1 Score | Training Time |
+|---|---|---|
+| FastText Baseline (e=25, lr=0.5, ng=2) | 0.60 | ~5s |
+| FastText Best (e=75, lr=0.5, ng=2) | 0.62 | ~8s |
+| Logistic Regression + TF-IDF | 0.66 | ~19s |
+
+The best FastText model trains roughly 2.4x faster than Logistic Regression
+(~8s vs. ~19s) while achieving a competitive F1 score. On larger datasets,
+FastText's speed advantage becomes significantly more pronounced.
+
+## Key Findings
+
+1. Hyperparameter tuning improved F1 from 0.60 (baseline) to 0.62 (best
+   configuration). Poor configuration choices, such as a low epoch count and
+   a low learning rate, can drop F1 to as low as 0.31.
+2. The most common misclassifications occur between semantically similar
+   categories such as talk.politics.misc and talk.politics.guns.
+3. Logistic Regression + TF-IDF slightly outperforms FastText on this
+   small dataset, but FastText scales significantly better to large datasets.
+4. FastText is best suited for large-scale text classification where
+   training speed and memory efficiency are critical.
+
+## Dependencies
+
+- fasttext-wheel
+- scikit-learn
+- numpy<2.0
+- pandas
+- matplotlib
+- seaborn
diff --git a/...605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/bashrc b/...605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/bashrc
@@ -0,0 +1 @@
+set -o vi  # Use vi-style key bindings for command-line editing.
diff --git a/...8_DATA605_Spring2026_FastText_text_classification/confusion_matrix_baseline.png b/...8_DATA605_Spring2026_FastText_text_classification/confusion_matrix_baseline.png
diff --git a/...sk458_DATA605_Spring2026_FastText_text_classification/confusion_matrix_best.png b/...sk458_DATA605_Spring2026_FastText_text_classification/confusion_matrix_best.png