From 4ee407dc7ee420a26441768036df98675fe721ff Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Wed, 1 Apr 2026 23:02:30 -0400 Subject: [PATCH 1/8] Assignment 2: Initial Project Setup --- .../.dockerignore | 143 +++++ .../Dockerfile | 30 + .../Dockerfile.python_slim | 28 + .../Dockerfile.ubuntu | 40 ++ .../Dockerfile.uv | 49 ++ .../README.md | 15 + .../bashrc | 1 + .../copy_docker_files.py | 140 ++++ .../docker_bash.sh | 34 + .../docker_build.sh | 40 ++ .../docker_build.version.log | 1 + .../docker_clean.sh | 26 + .../docker_cmd.sh | 41 ++ .../docker_exec.sh | 25 + .../docker_jupyter.sh | 39 ++ .../docker_name.sh | 12 + .../docker_push.sh | 25 + .../etc_sudoers | 31 + .../requirements.txt | 4 + .../run_jupyter.sh | 35 + .../template.API.ipynb | 215 +++++++ .../template.API.py | 129 ++++ .../template.example.ipynb | 198 ++++++ .../template.example.py | 125 ++++ .../template_utils.py | 72 +++ .../test/test_docker_all.py | 48 ++ .../utils.sh | 607 ++++++++++++++++++ .../version.sh | 28 + 28 files changed, 2181 insertions(+) create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/.dockerignore create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.python_slim create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.ubuntu create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.uv create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md create mode 100644 
class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/bashrc create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/copy_docker_files.py create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.version.log create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_cmd.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_exec.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_push.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/etc_sudoers create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/requirements.txt create mode 100755 
class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.ipynb create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.ipynb create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template_utils.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/test/test_docker_all.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/.dockerignore b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/.dockerignore new file mode 100644 index 000000000..fd85b2584 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/.dockerignore @@ -0,0 +1,143 @@ +# Exclude files from Docker build context. This prevents unnecessary files from +# being sent to Docker daemon, reducing build time and image size. 
+ +# Python artifacts +__pycache__/ +*.pyc +*.pyo +*.pyd +*.egg-info/ + +# Virtual environments +venv/ +.venv/ +env/ +.env +.envrc +client_venv.helpers/ +ENV/ + +# Jupyter +.ipynb_checkpoints/ +.jupyter/ + +# Build artifacts +build/ +dist/ +*.eggs/ +.eggs/ + +# Cache and temporary files +*.log +*.tmp +*.cache +.pytest_cache/ +.mypy_cache/ +.coverage +htmlcov/ + +# Git and version control +.git/ +.gitignore +.gitattributes +.github/ + +# Docker build scripts (not needed at runtime) +docker_build.sh +docker_push.sh +docker_clean.sh +docker_exec.sh +docker_cmd.sh +docker_bash.sh +docker_jupyter.sh +docker_name.sh +run_jupyter.sh +Dockerfile.* +.dockerignore + +# Documentation +README.md +README.admin.md +docs/ +*.md +CHANGELOG.md +LICENSE + +# Configuration and secrets +.env.* +.env.local +.env.development +.env.production +.DS_Store +Thumbs.db + +# Shell configuration +.bashrc +.bash_history +.zshrc + +# Large data files (mount via volume instead) +data/ +*.csv +*.pkl +*.h5 +*.parquet +*.feather +*.arrow +*.npy +*.npz + +# Generated images +*.png +*.jpg +*.jpeg +*.gif +*.svg +*.pdf + +# Test files and examples +tests/ +test_* +*_test.py +tutorials/ +examples/ + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ +.project +.pydevproject +.settings/ +*.iml +.sublime-project +.sublime-workspace + +# Node and frontend (if applicable) +node_modules/ +npm-debug.log +yarn-error.log +.npm + +# Requirements management +requirements.in +Pipfile +Pipfile.lock +poetry.lock +setup.py +setup.cfg + +# CI/CD configuration +.gitlab-ci.yml +.travis.yml +Jenkinsfile +.circleci/ + +# Miscellaneous +*.bak +.venv.bak/ +*.whl +*.tar.gz +*.zip diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile new file mode 100644 index 000000000..f5c02c562 --- /dev/null +++ 
b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile @@ -0,0 +1,30 @@ +# Use Python 3.12 slim (already has Python and pip). +FROM python:3.12-slim + +# Avoid interactive prompts during apt operations. +ENV DEBIAN_FRONTEND=noninteractive + +# Install CA certificates (needed for HTTPS). +RUN apt-get update && apt-get install -y \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install project specific packages. +RUN mkdir -p /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 + +CMD ["/bin/bash"] diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.python_slim b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.python_slim new file mode 100644 index 000000000..cc8f18f2f --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.python_slim @@ -0,0 +1,28 @@ +# Use Python 3.12 slim (already has Python and pip). +FROM python:3.12-slim + +# Avoid interactive prompts during apt operations. +ENV DEBIAN_FRONTEND=noninteractive + +# Install CA certificates (needed for HTTPS). +RUN apt-get update && apt-get install -y \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install project specific packages. 
+RUN mkdir -p /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.ubuntu b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.ubuntu new file mode 100644 index 000000000..705105d91 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.ubuntu @@ -0,0 +1,40 @@ +FROM ubuntu:24.04 +ENV DEBIAN_FRONTEND=noninteractive + +# Install system utilities and Python in a single layer. +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + git \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* + +# Create virtual environment. +RUN python3 -m venv /opt/venv + +# Make the venv the default Python. +ENV PATH="/opt/venv/bin:$PATH" + +# Install project specific packages. +RUN mkdir /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. 
+EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.uv b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.uv new file mode 100644 index 000000000..d3b2a0abc --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile.uv @@ -0,0 +1,49 @@ +FROM ubuntu:24.04 +ENV DEBIAN_FRONTEND=noninteractive + +# Install system utilities and Python in a single layer. +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + git \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + libgomp1 \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Install uv for package management. +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +# Install project specific packages using uv. +COPY pyproject.toml uv.lock /app/ +WORKDIR /app +RUN uv sync +ENV PATH="/app/.venv/bin:$PATH" + +# Install Jupyter into the uv-managed venv (uv venvs ship without pip, so plain `pip` would hit the system Python). +RUN uv pip install --no-cache --python /app/.venv/bin/python \ + jupyterlab jupyterlab_vim jupytext + +# Copy project files. +COPY . /app + +RUN mkdir /install + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. 
+EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md new file mode 100644 index 000000000..2953bc5f7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md @@ -0,0 +1,15 @@ +# HuggingFace Text Classification Model + +## Description + +HuggingFace is an open-source platform that provides ready-to-use, state-of-the-art language models along with the tools needed to fine-tune, evaluate, and deploy them for any natural language task, without needing to build models from scratch. + +This project builds a News Article Classification Pipeline on top of HuggingFace. The system ingests data from public news datasets (AG News, BBC News), fine-tunes transformer models (BERT, DistilBERT, and RoBERTa) for multi-class topic classification, and, given a raw news article, serves predictions through a live inference endpoint and dashboard. + +The full stack uses HuggingFace Transformers and Datasets for tokenization and fine-tuning, PyTorch as the training backend, scikit-learn for evaluation metrics, FastAPI for inference serving, and Streamlit for the prediction dashboard. 
+ +## Project Specs: +https://github.com/gpsaggese/gpsaggese.github.io/blob/master/class_project/data605/Spring2026/projects_descriptions/HuggingFace_Project_Description.md + +**Authors**: @riyaapuri @stupatel17 +**Assigned to**: @riyaapuri @stupatel17 @protocorn @gpsaggese \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/bashrc b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/bashrc new file mode 100644 index 000000000..4b7ff4c49 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/bashrc @@ -0,0 +1 @@ +set -o vi diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/copy_docker_files.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/copy_docker_files.py new file mode 100644 index 000000000..0e97c194c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/copy_docker_files.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +""" +Copy Docker-related files from the source directory to a destination directory. + +This script copies all Docker configuration and utility files from +class_project/project_template/ to a specified destination directory. + +Usage examples: + # Copy all files to a target directory. + > ./copy_docker_files.py --dst_dir /path/to/destination + + # Copy with verbose logging. 
+ > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG + +Import as: + +import class_project.project_template.copy_docker_files as cpdccodo +""" + +import argparse +import logging +import os +from typing import List + +import helpers.hdbg as hdbg +import helpers.hio as hio +import helpers.hparser as hparser +import helpers.hsystem as hsystem + +_LOG = logging.getLogger(__name__) + +# ############################################################################# +# Constants +# ############################################################################# + +# List of files to copy from the source directory (one entry per file; keep the trailing commas). +_FILES_TO_COPY = [ + "bashrc", + "docker_bash.sh", + "docker_build.sh", + "docker_clean.sh", + "docker_cmd.sh", + "docker_exec.sh", + "docker_jupyter.sh", + "docker_name.sh", + "docker_push.sh", + "etc_sudoers", + "install_jupyter_extensions.sh", # NOTE(review): this patch adds no such file next to the script - confirm it exists in the source dir. + "run_jupyter.sh", + "version.sh", +] + + +# ############################################################################# +# Helper functions +# ############################################################################# + + +def _get_source_dir() -> str: + """ + Get the absolute path to the source directory containing Docker files. + + :return: absolute path of the directory that contains this script + """ + # Get the directory where this script is located. + script_dir = os.path.dirname(os.path.abspath(__file__)) + _LOG.debug("Script directory='%s'", script_dir) + return script_dir + + +def _copy_files( + *, + src_dir: str, + dst_dir: str, + files: List[str], +) -> None: + """ + Copy specified files from source directory to destination directory. 
+ + :param src_dir: source directory path + :param dst_dir: destination directory path + :param files: list of filenames to copy + """ + # Verify source directory exists. + hdbg.dassert_dir_exists(src_dir, "Source directory does not exist:", src_dir) + # Create destination directory if it doesn't exist. 
+ hio.create_dir(dst_dir, incremental=True) + _LOG.info("Copying %d files from '%s' to '%s'", len(files), src_dir, dst_dir) + # Copy each file. + copied_count = 0 + for filename in files: + src_path = os.path.join(src_dir, filename) + dst_path = os.path.join(dst_dir, filename) + # Verify source file exists. + hdbg.dassert_path_exists( + src_path, "Source file does not exist:", src_path + ) + # Copy with cp -a to preserve permissions and attributes; the paths are quoted so whitespace in them survives (presumably hsystem.system runs the string through a shell - confirm). + _LOG.debug("Copying '%s' -> '%s'", src_path, dst_path) + cmd = f"cp -a '{src_path}' '{dst_path}'" + hsystem.system(cmd) + copied_count += 1 + # + _LOG.info("Successfully copied %d files", copied_count) + + +# ############################################################################# + + +def _parse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--dst_dir", + action="store", + required=True, + help="Destination directory where files will be copied", + ) + hparser.add_verbosity_arg(parser) + return parser + + +def _main(parser: argparse.ArgumentParser) -> None: + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + # Get source directory. + src_dir = _get_source_dir() + # Copy files to destination. 
+ _copy_files( + src_dir=src_dir, + dst_dir=args.dst_dir, + files=_FILES_TO_COPY, + ) + + +if __name__ == "__main__": + _main(_parse()) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh new file mode 100755 index 000000000..0025e81f4 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# """ +# This script launches a Docker container with an interactive bash shell for +# development. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions from the project template. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List the available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" + +# Configure and run the Docker container with interactive bash shell. 
+# - Container is removed automatically on exit (--rm) +# - Interactive mode with TTY allocation (-ti) +# - Port forwarding for Jupyter or other services +# - Git root mounted to /git_root inside container +CONTAINER_NAME=${IMAGE_NAME}_bash +PORT= +DOCKER_CMD=$(get_docker_bash_command) +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh new file mode 100755 index 000000000..5b0957a99 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# """ +# Build a Docker container image for the project. +# +# This script sets up the build environment with error handling and command +# tracing, loads Docker configuration from docker_name.sh, and builds the +# Docker image using the build_container_image utility function. It supports +# both single-architecture and multi-architecture builds via the +# DOCKER_BUILD_MULTI_ARCH environment variable. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args are passed to the build. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Load Docker configuration variables (REPO_NAME, IMAGE_NAME, FULL_IMAGE_NAME). +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure Docker build settings. +# Enable BuildKit for improved build performance and features. 
+export DOCKER_BUILDKIT=1 +#export DOCKER_BUILDKIT=0 + +# Configure single-architecture build (set to 1 for multi-arch build). +#export DOCKER_BUILD_MULTI_ARCH=1 +export DOCKER_BUILD_MULTI_ARCH=0 + +# Build the container image. +# Pass extra arguments (e.g., --no-cache) via command line after -v. +build_container_image "$@" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.version.log b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.version.log new file mode 100644 index 000000000..8315eefe2 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.version.log @@ -0,0 +1 @@ +the input device is not a TTY diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh new file mode 100755 index 000000000..7e40839ae --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# """ +# Remove Docker container image for the project. +# +# This script cleans up Docker images by removing the container image +# matching the project configuration. Useful for freeing disk space or +# ensuring a fresh build. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. 
+get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Remove the container image. +remove_container_image diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_cmd.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_cmd.sh new file mode 100755 index 000000000..906d7a77b --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_cmd.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# """ +# Execute a command in a Docker container. +# +# This script runs a specified command inside a new Docker container instance. +# The container is removed automatically after the command completes. The +# git root is mounted to /git_root inside the container. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args form the command. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Capture the command to execute from remaining arguments. +CMD="$@" +echo "Executing: '$CMD'" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" +#(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Configure and run the Docker container with the specified command. 
+CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_cmd_command) +PORT="" +DOCKER_RUN_OPTS="" +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT $DOCKER_RUN_OPTS) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME bash -c '$CMD'" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_exec.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_exec.sh new file mode 100755 index 000000000..24f8e401a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_exec.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Execute a bash shell in a running Docker container. +# +# This script connects to an already running Docker container and opens an +# interactive bash session for debugging or inspection purposes. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Execute bash shell in the running container. 
+exec_container diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh new file mode 100755 index 000000000..1a60dfd3a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# """ +# Execute Jupyter Lab in a Docker container. +# +# This script launches a Docker container running Jupyter Lab with +# configurable port, directory mounting, and vim bindings. It passes +# command-line options to the run_jupyter.sh script inside the container. +# +# Usage: +# > docker_jupyter.sh [options] +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse command-line options and set Jupyter configuration variables. +parse_docker_jupyter_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images and inspect architecture. +list_and_inspect_docker_image + +# Run the Docker container with Jupyter Lab. +CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") +CONTAINER_NAME=$IMAGE_NAME +# Kill existing container if -f flag is set. 
+kill_existing_container_if_forced + +DOCKER_CMD=$(get_docker_jupyter_command) +DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh new file mode 100644 index 000000000..32a546cf3 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# """ +# Docker image naming configuration. +# +# This file defines the repository name, image name, and full image name +# variables used by all docker_*.sh scripts in the project template. +# """ + +REPO_NAME=gpsaggese +# The image name must be all lower case. NOTE(review): still the template default - confirm whether this project should use its own image name. +IMAGE_NAME=umd_project_template +FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_push.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_push.sh new file mode 100755 index 000000000..27d752dd9 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_push.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Push Docker container image to Docker Hub or registry. +# +# This script authenticates with the Docker registry using credentials from +# ~/.docker/passwd.$REPO_NAME.txt and pushes the locally built container +# image to the remote repository. +# """ + +# Exit immediately if any command exits with a non-zero status. 
+set -e + +# Import the utility functions. 
+GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker image naming configuration. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source $SCRIPT_DIR/docker_name.sh + +# Push the container image to the registry. +push_container_image diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/etc_sudoers b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/etc_sudoers new file mode 100644 index 000000000..ee0816a15 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/etc_sudoers @@ -0,0 +1,31 @@ +# +# This file MUST be edited with the 'visudo' command as root. +# +# Please consider adding local content in /etc/sudoers.d/ instead of +# directly modifying this file. +# +# See the man page for details on how to write a sudoers file. 
+# +Defaults env_reset +Defaults mail_badpass +Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin" + +# Host alias specification + +# User alias specification + +# Cmnd alias specification + +# User privilege specification +root ALL=(ALL:ALL) ALL + +# Members of the admin group may gain root privileges +%admin ALL=(ALL) ALL + +# Allow members of group sudo to execute any command +%sudo ALL=(ALL:ALL) ALL + +# See sudoers(5) for more information on "#include" directives: +postgres ALL=(ALL) NOPASSWD:ALL + +#includedir /etc/sudoers.d diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/requirements.txt b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/requirements.txt new file mode 100644 index 000000000..49aca3901 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/requirements.txt @@ -0,0 +1,4 @@ +matplotlib +numpy +pandas +seaborn diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh new file mode 100755 index 000000000..d725c3fe7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# """ +# Launch Jupyter Lab server. 
+# +# This script starts Jupyter Lab on port 8888 with the following configuration: +# - No browser auto-launch (useful for Docker containers) +# - Accessible from any IP address (0.0.0.0) +# - Root user allowed (required for Docker environments) +# - No authentication token or password (for development convenience) +# - Vim keybindings can be enabled via JUPYTER_USE_VIM environment variable +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Print each command to stdout before executing it. +#set -x + +# Import the utility functions from /git_root. +GIT_ROOT=/git_root +source $GIT_ROOT/class_project/project_template/utils.sh + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Setup Jupyter Lab environment. +setup_jupyter_environment + +# Initialize Jupyter Lab command with base configuration. +JUPYTER_ARGS=$(get_jupyter_args) + +# Start Jupyter Lab with development-friendly settings. +run "jupyter lab $JUPYTER_ARGS" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.ipynb b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.ipynb new file mode 100644 index 000000000..3afca937c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "183c2248-ea3d-43ba-b87e-d821bba1bbc6", + "metadata": {}, + "source": [ + "# Template API Notebook\n", + "\n", + "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`.\n", + "\n", + "- Add description of what the notebook does.\n", + "- Point to references, e.g. 
(neo4j.API.md)\n", + "- Add citations.\n", + "- Keep the notebook flow clear.\n", + "- Comments should be imperative and have a period at the end.\n", + "- Your code should be well commented.\n", + "\n", + "The name of this notebook should be in the following format:\n", + "- if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb`\n", + "\n", + "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "265e0d58-a7cd-4edf-a0b4-96b60220e801", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "d3b2f997-5c9b-4238-b6d5-e5f2cea43809", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d1480ee9-d6a6-437d-b927-da6cbb05bdf5", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "# Import libraries in this section.\n", + "# Avoid imports like import *, from ... import ..., from ...
import *, etc.\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import helpers.hnotebook as hnotebo" + ] + }, + { + "cell_type": "markdown", + "id": "f9208cc9-837d-4fec-a312-9c4aa5b7648d", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9a2d7a9c-c6c5-48c9-8445-11c97045d00b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0mWARNING: Running in Jupyter\n", + "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-085a2ce7-6161-4c8a-92d5-492051832f3c.json'\n" + ] + } + ], + "source": [ + "hdbg.init_logger(verbosity=logging.INFO)\n", + "\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "hnotebo.config_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "79c37ba3-bd5d-4a44-87df-645eee54977a", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Make the notebook flow clear\n", + "Each notebook needs to follow a clear and logical flow, e.g:\n", + "- Load data\n", + "- Compute stats\n", + "- Clean data\n", + "- Compute stats\n", + "- Do analysis\n", + "- Show results\n", + "\n", + "\n", + "\n", + "\n", + "#############################################################################\n", + "Template\n", + "#############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a8a109cd-fc8e-4b9e-9dc0-4fc8d4126ad8", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class Template:\n", + " \"\"\"\n", + " Brief imperative description of what the class does in one line, if needed.\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " pass\n", + "\n", + " def method1(self, arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the method does in one line.\n", + "\n", + " You can elaborate more in 
the method docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every method/function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass\n", + "\n", + "\n", + "def template_function(arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the function does in one line.\n", + "\n", + " You can elaborate more in the function docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "00926523-ae59-497d-bba8-b22e58333849", + "metadata": {}, + "source": [ + "## The flow should be highlighted using headings in markdown\n", + "```\n", + "# Level 1\n", + "## Level 2\n", + "### Level 3\n", + "```" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git
a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.py new file mode 100644 index 000000000..4192ef8fe --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.API.py @@ -0,0 +1,129 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Template API Notebook +# +# This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`. +# +# - Add description of what the notebook does. +# - Point to references, e.g. (neo4j.API.md) +# - Add citations. +# - Keep the notebook flow clear. +# - Comments should be imperative and have a period at the end. +# - Your code should be well commented. +# +# The name of this notebook should be in the following format: +# - if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb` +# +# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md + +# %% +# %load_ext autoreload +# %autoreload 2 +# %matplotlib inline + +# %% [markdown] +# ## Imports + +# %% +import logging +# Import libraries in this section. +# Avoid imports like import *, from ... import ..., from ... import *, etc.
+ +import helpers.hdbg as hdbg +import helpers.hnotebook as hnotebo + +# %% [markdown] +# ## Configuration + +# %% +hdbg.init_logger(verbosity=logging.INFO) + +_LOG = logging.getLogger(__name__) + +hnotebo.config_notebook() + + +# %% [markdown] +# ## Make the notebook flow clear +# Each notebook needs to follow a clear and logical flow, e.g: +# - Load data +# - Compute stats +# - Clean data +# - Compute stats +# - Do analysis +# - Show results +# +# +# +# + + +# ############################################################################# +# Template +# ############################################################################# + + +# %% +class Template: + """ + Brief imperative description of what the class does in one line, if needed. + """ + + def __init__(self): + pass + + def method1(self, arg1: int) -> None: + """ + Brief imperative description of what the method does in one line. + + You can elaborate more in the method docstring in this section, for e.g. explaining + the formula/algorithm. Every method/function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks. + pass + + +def template_function(arg1: int) -> None: + """ + Brief imperative description of what the function does in one line. + + You can elaborate more in the function docstring in this section, for e.g. explaining + the formula/algorithm. Every function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks.
+ pass + + +# %% [markdown] +# ## The flow should be highlighted using headings in markdown +# ``` +# # Level 1 +# ## Level 2 +# ### Level 3 +# ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.ipynb b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.ipynb new file mode 100644 index 000000000..a2e9aedd7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "50f78f7e-2dee-45d6-9d37-7a55eeaae283", + "metadata": {}, + "source": [ + "# Template Example Notebook\n", + "\n", + "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`.\n", + "\n", + "- Add description of what the notebook does.\n", + "- Point to references, e.g. 
(neo4j.example.md)\n", + "- Add citations.\n", + "- Keep the notebook flow clear.\n", + "- Comments should be imperative and have a period at the end.\n", + "- Your code should be well commented.\n", + "\n", + "The name of this notebook should be in the following format:\n", + "- if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb`\n", + "\n", + "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6226667e-cab5-479c-be6a-6b7d6f580a97", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8020901a-4bc7-4b73-95e8-aaa462b4fc19", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "# Import libraries in this section.\n", + "# Avoid imports like import *, from ... import ..., from ...
import *, etc.\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import helpers.hnotebook as hnotebo" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ecb72b2-b21d-4fb0-ac92-e7174da390e6", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0mWARNING: Running in Jupyter\n", + "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-783e0930-1631-4d64-8bb4-f3a98bb74fcd.json'\n" + ] + } + ], + "source": [ + "hdbg.init_logger(verbosity=logging.INFO)\n", + "\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "hnotebo.config_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "1ede6422-bff2-4f0a-8d28-29a01d4786b2", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Make the notebook flow clear\n", + "Each notebook needs to follow a clear and logical flow, e.g:\n", + "- Load data\n", + "- Compute stats\n", + "- Clean data\n", + "- Compute stats\n", + "- Do analysis\n", + "- Show results\n", + "\n", + "\n", + "\n", + "\n", + "#############################################################################\n", + "Template\n", + "#############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8bbd660d-d22f-44fa-bf53-dd622dee0f53", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class Template:\n", + " \"\"\"\n", + " Brief imperative description of what the class does in one line, if needed.\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " pass\n", + "\n", + " def method1(self, arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the method does in one line.\n", + "\n", + " You can elaborate more in the method docstring in this section, for e.g. explaining\n", + " the formula/algorithm. 
Every method/function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass\n", + "\n", + "\n", + "def template_function(arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the function does in one line.\n", + "\n", + " You can elaborate more in the function docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "103f6e36-54cf-442c-b137-8091d48805a7", + "metadata": {}, + "source": [ + "## The flow should be highlighted using headings in markdown\n", + "```\n", + "# Level 1\n", + "## Level 2\n", + "### Level 3\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05d52af-67ba-4a4f-a561-af453e43854f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git
a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.py new file mode 100644 index 000000000..8566ff277 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template.example.py @@ -0,0 +1,125 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Template Example Notebook +# +# This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`. +# +# - Add description of what the notebook does. +# - Point to references, e.g. (neo4j.example.md) +# - Add citations. +# - Keep the notebook flow clear. +# - Comments should be imperative and have a period at the end. +# - Your code should be well commented. +# +# The name of this notebook should be in the following format: +# - if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb` +# +# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md + +# %% +# %load_ext autoreload +# %autoreload 2 +# %matplotlib inline + +# %% +import logging +# Import libraries in this section. +# Avoid imports like import *, from ... import ..., from ... import *, etc.
+ +import helpers.hdbg as hdbg +import helpers.hnotebook as hnotebo + +# %% +hdbg.init_logger(verbosity=logging.INFO) + +_LOG = logging.getLogger(__name__) + +hnotebo.config_notebook() + + +# %% [markdown] +# ## Make the notebook flow clear +# Each notebook needs to follow a clear and logical flow, e.g: +# - Load data +# - Compute stats +# - Clean data +# - Compute stats +# - Do analysis +# - Show results +# +# +# +# + + +# ############################################################################# +# Template +# ############################################################################# + + +# %% +class Template: + """ + Brief imperative description of what the class does in one line, if needed. + """ + + def __init__(self): + pass + + def method1(self, arg1: int) -> None: + """ + Brief imperative description of what the method does in one line. + + You can elaborate more in the method docstring in this section, for e.g. explaining + the formula/algorithm. Every method/function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks. + pass + + +def template_function(arg1: int) -> None: + """ + Brief imperative description of what the function does in one line. + + You can elaborate more in the function docstring in this section, for e.g. explaining + the formula/algorithm. Every function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks.
+ pass + + +# %% [markdown] +# ## The flow should be highlighted using headings in markdown +# ``` +# # Level 1 +# ## Level 2 +# ### Level 3 +# ``` + +# %% diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template_utils.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template_utils.py new file mode 100644 index 000000000..f8916102e --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/template_utils.py @@ -0,0 +1,72 @@ +""" +template_utils.py + +This file contains utility functions that support the tutorial notebooks. + +- Notebooks should call these functions instead of writing raw logic inline. +- This helps keep the notebooks clean, modular, and easier to debug. +- Students should implement functions here for data preprocessing, + model setup, evaluation, or any reusable logic. + +Import as: + +import class_project.project_template.template_utils as cpptteut +""" + +import pandas as pd +import logging +from sklearn.model_selection import train_test_split +from pycaret.classification import compare_models + +# ----------------------------------------------------------------------------- +# Logging +# ----------------------------------------------------------------------------- + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Example 1: Split the dataset into train and test sets +# ----------------------------------------------------------------------------- + + +def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2): + """ + Split the dataset into training and testing sets. 
+ + :param df: full dataset + :param target_column: name of the target column + :param test_size: proportion of test data (default = 0.2) + + :return: X_train, X_test, y_train, y_test + """ + logger.info("Splitting data into train and test sets") + X = df.drop(columns=[target_column]) + y = df[target_column] + return train_test_split(X, y, test_size=test_size, random_state=42) + + +# ----------------------------------------------------------------------------- +# Example 2: PyCaret classification pipeline +# ----------------------------------------------------------------------------- + + +def run_pycaret_classification( + df: pd.DataFrame, target_column: str +) -> pd.DataFrame: + """ + Run a basic PyCaret classification experiment. + + :param df: dataset containing features and target + :param target_column: name of the target column + + :return: comparison of top-performing models + """ + logger.info("Initializing PyCaret classification setup") + ... + + logger.info("Comparing models") + results = compare_models() + ... + + return results diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/test/test_docker_all.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/test/test_docker_all.py new file mode 100644 index 000000000..904cdd7af --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/test/test_docker_all.py @@ -0,0 +1,48 @@ +""" +Run each notebook in class_project/project_template/ inside Docker using docker_cmd.sh. 
+ +Import as: + +import class_project.project_template.test.test_docker_all as tptdal +""" + +import logging + +import pytest + +import helpers.hdocker_tests as hdoctest + +_LOG = logging.getLogger(__name__) + + +# ############################################################################# +# Test_docker +# ############################################################################# + + +class Test_docker(hdoctest.DockerTestCase): + """ + Run all Docker tests for class_project/project_template/. + """ + + _test_file = __file__ + + @pytest.mark.slow + def test1(self) -> None: + """ + Test that template.example.ipynb runs without error inside Docker. + """ + # Prepare inputs. + notebook_name = "template.example.ipynb" + # Run test. + self._helper(notebook_name) + + @pytest.mark.slow + def test2(self) -> None: + """ + Test that template.API.ipynb runs without error inside Docker. + """ + # Prepare inputs. + notebook_name = "template.API.ipynb" + # Run test. + self._helper(notebook_name) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh new file mode 100644 index 000000000..cc0ed8c4a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh @@ -0,0 +1,607 @@ +#!/bin/bash +# """ +# Utility functions for Docker container management. +# """ + + +# ############################################################################# +# General utilities +# ############################################################################# + + +run() { + # """ + # Execute a command with echo output. 
+ # + # :param cmd: Command string to execute + # :return: Exit status of the executed command + # """ + cmd="$*" + echo "> $cmd" + eval "$cmd" +} + + +enable_verbose_mode() { + # """ + # Enable shell command tracing (set -x) when VERBOSE is set to 1. + # + # Reads the VERBOSE variable set by parse_default_args or parse_docker_jupyter_args. + # Call this after parsing args to activate tracing for the rest of the script. + # """ + if [[ $VERBOSE == 1 ]]; then + set -x + fi +} + + +# ############################################################################# +# Argument parsing +# ############################################################################# + + +_print_default_help() { + # """ + # Print usage information and available default options for docker scripts. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Options:" + echo " -f Force kill existing container with same name before starting" + echo " -h Print this help message and exit" + echo " -v Enable verbose output (set -x)" +} + + +parse_default_args() { + # """ + # Parse default command-line arguments for docker scripts. + # + # Sets VERBOSE and FORCE variables in the caller's scope. Enables set -x + # when -v is passed. Prints help and exits when -h is passed. + # Updates OPTIND so the caller can shift away processed arguments. + # + # :param @: command-line arguments forwarded from the calling script + # """ + VERBOSE=0 + FORCE=0 + while getopts "fhv" flag; do + case "${flag}" in + f) FORCE=1;; + h) _print_default_help; exit 0;; + v) VERBOSE=1;; + *) _print_default_help; exit 1;; + esac + done + enable_verbose_mode +} + + +_print_docker_jupyter_help() { + # """ + # Print usage information and available options for docker_jupyter.sh. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Launch Jupyter Lab inside a Docker container."
+ echo "" + echo "Options:" + echo " -f Force kill existing container with same name before starting" + echo " -h Print this help message and exit" + echo " -p PORT Host port to forward to Jupyter Lab (default: 8888)" + echo " -u Enable vim keybindings in Jupyter Lab" + echo " -v Enable verbose output (set -x)" +} + + +parse_docker_jupyter_args() { + # """ + # Parse command-line arguments for docker_jupyter.sh. + # + # Sets JUPYTER_HOST_PORT, JUPYTER_USE_VIM, VERBOSE, FORCE, and + # OLD_CMD_OPTS in the caller's scope. Enables set -x when -v is passed. + # Prints help and exits when -h is passed. + # + # :param @: command-line arguments forwarded from the calling script + # """ + # Set defaults. + JUPYTER_HOST_PORT=8888 + JUPYTER_USE_VIM=0 + VERBOSE=0 + FORCE=0 + # Save original args to pass through to run_jupyter.sh. + OLD_CMD_OPTS="$*" + # Parse options. + while getopts "fhp:uv" flag; do + case "${flag}" in + f) FORCE=1;; + h) _print_docker_jupyter_help; exit 0;; + p) JUPYTER_HOST_PORT=${OPTARG};; # Port for Jupyter Lab. + u) JUPYTER_USE_VIM=1;; # Enable vim bindings. + v) VERBOSE=1;; # Enable verbose output. + *) _print_docker_jupyter_help; exit 1;; + esac + done + # Enable command tracing if verbose mode is requested. + enable_verbose_mode +} + + +# ############################################################################# +# Docker image management +# ############################################################################# + + +get_docker_vars_script() { + # """ + # Load Docker variables from docker_name.sh script. + # + # :param script_path: Path to the script to determine the Docker configuration directory + # :return: Sources REPO_NAME, IMAGE_NAME, and FULL_IMAGE_NAME variables + # """ + local script_path=$1 + # Find the name of the container. + SCRIPT_DIR=$(dirname $script_path) + DOCKER_NAME="$SCRIPT_DIR/docker_name.sh" + if [[ !
-e $DOCKER_NAME ]]; then + echo "Can't find $DOCKER_NAME" + exit -1 + fi; + source $DOCKER_NAME +} + + +print_docker_vars() { + # """ + # Print current Docker variables to stdout. + # """ + echo "REPO_NAME=$REPO_NAME" + echo "IMAGE_NAME=$IMAGE_NAME" + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" +} + + +build_container_image() { + # """ + # Build a Docker container image. + # + # Supports both single-architecture and multi-architecture builds. + # Creates temporary build directory, copies files, and builds the image. + # + # :param @: Additional options to pass to docker build/buildx build + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + # Prepare build area. + #tar -czh . | docker build $OPTS -t $IMAGE_NAME - + DIR="../tmp.build" + if [[ -d $DIR ]]; then + rm -rf $DIR + fi; + cp -Lr . $DIR || true + # Build container. + echo "DOCKER_BUILDKIT=$DOCKER_BUILDKIT" + echo "DOCKER_BUILD_MULTI_ARCH=$DOCKER_BUILD_MULTI_ARCH" + if [[ $DOCKER_BUILD_MULTI_ARCH != 1 ]]; then + # Build for a single architecture. + echo "Building for current architecture..." + OPTS="--progress plain $@" + (cd $DIR; docker build $OPTS -t $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]}) + else + # Build for multiple architectures. + echo "Building for multiple architectures..." + OPTS="$@" + export DOCKER_CLI_EXPERIMENTAL=enabled + # Create a new builder. + #docker buildx rm --all-inactive --force + #docker buildx create --name mybuilder + #docker buildx use mybuilder + # Use the default builder. + docker buildx use multiarch + docker buildx inspect --bootstrap + # Note that one needs to push to the repo since otherwise it is not + # possible to keep multiple. + (cd $DIR; docker buildx build --push --platform linux/arm64,linux/amd64 $OPTS --tag $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]}) + # Report the status.
+ docker buildx imagetools inspect $FULL_IMAGE_NAME + fi; + # Report build version. + if [ -f docker_build.version.log ]; then + rm docker_build.version.log + fi + (cd $DIR; docker run --rm -it -v $(pwd):/data $FULL_IMAGE_NAME bash -c "/data/version.sh") 2>&1 | tee docker_build.version.log + # + docker image ls $REPO_NAME/$IMAGE_NAME + rm -rf $DIR + echo "*****************************" + echo "SUCCESS" + echo "*****************************" +} + + +remove_container_image() { + # """ + # Remove Docker container image(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker image ls | grep $FULL_IMAGE_NAME + docker image ls | grep $FULL_IMAGE_NAME | awk '{print $1}' | xargs -n 1 -t docker image rm -f + docker image ls + echo "${FUNCNAME[0]} ... done" +} + + +push_container_image() { + # """ + # Push Docker container image to registry. + # + # Authenticates using credentials from ~/.docker/passwd.$REPO_NAME.txt. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker login --username $REPO_NAME --password-stdin <~/.docker/passwd.$REPO_NAME.txt + docker images $FULL_IMAGE_NAME + docker push $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... done" +} + + +pull_container_image() { + # """ + # Pull Docker container image from registry. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker pull $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker container management +# ############################################################################# + + +kill_container() { + # """ + # Kill and remove Docker container(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." 
+ FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker container ls + # + CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}') + echo "CONTAINER_ID=$CONTAINER_ID" + if [[ ! -z $CONTAINER_ID ]]; then + docker container rm -f $CONTAINER_ID + docker container ls + fi; + echo "${FUNCNAME[0]} ... done" +} + + +kill_container_by_name() { + # """ + # Kill and remove a Docker container by its name. + # + # :param container_name: Name of the container to kill + # """ + local container_name=$1 + echo "# ${FUNCNAME[0]}: $container_name" + # Check if container exists (running or stopped). + local container_id=$(docker container ls -a --filter "name=^${container_name}$" --format "{{.ID}}") + if [[ -n $container_id ]]; then + echo "Killing container: $container_name (ID: $container_id)" + docker container rm -f $container_id + else + echo "Container '$container_name' not found" + fi + echo "${FUNCNAME[0]} ... done" +} + + +exec_container() { + # """ + # Execute bash shell in running Docker container. + # + # Opens an interactive bash session in the first container matching the + # current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker container ls + # + CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}') + echo "CONTAINER_ID=$CONTAINER_ID" + docker exec -it $CONTAINER_ID bash + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker common options +# ############################################################################# + + +get_docker_common_options() { + # """ + # Return docker run options common to all container types. + # + # Includes volume mount for the git root, plus environment variables for + # PYTHONPATH and host OS name. 
+ # + # :return: docker run options string with volume mounts and env vars + # """ + echo "-v $GIT_ROOT:/git_root \ + -e PYTHONPATH=/git_root:/git_root/helpers_root:/git_root/msml610/tutorials \ + -e CSFY_GIT_ROOT_PATH=/git_root \ + -e CSFY_HOST_OS_NAME=$(uname -s) \ + -e CSFY_HOST_NAME=$(uname -n)" +} + + +# ############################################################################# +# Docker bash +# ############################################################################# + + +get_docker_bash_command() { + # """ + # Return the base docker run command for an interactive bash shell. + # + # :return: docker run command string with --rm and -ti flags + # """ + if [ -t 0 ]; then + echo "docker run --rm -ti" + else + echo "docker run --rm -i" + fi +} + + +get_docker_bash_options() { + # """ + # Return docker run options for a Docker container. + # + # :param container_name: Name for the Docker container + # :param port: Port number to forward (optional, skipped if empty) + # :param extra_opts: Additional docker run options (optional) + # :return: docker run options string with name, volume mounts, and env vars + # """ + local container_name=$1 + local port=$2 + local extra_opts=$3 + local port_opt="" + if [[ -n $port ]]; then + port_opt="-p $port:$port" + fi + echo "--name $container_name \ + $port_opt \ + $extra_opts \ + $(get_docker_common_options)" +} + + +# ############################################################################# +# Docker cmd +# ############################################################################# + + +get_docker_cmd_command() { + # """ + # Return the base docker run command for executing a non-interactive command. 
+ # + # :return: docker run command string with --rm and -i flags + # """ + echo "docker run --rm -i" +} + + +# ############################################################################# +# Docker Jupyter +# ############################################################################# + + +get_docker_jupyter_command() { + # """ + # Return the base docker run command for running Jupyter Lab interactively. + # + # :return: docker run command string with --rm and -ti flags (if TTY available) + # """ + local docker_cmd="docker run --rm" + # Add interactive and TTY flags only if stdin is a TTY. + if [[ -t 0 ]]; then + docker_cmd="$docker_cmd -ti" + fi + echo "$docker_cmd" +} + + +get_docker_jupyter_options() { + # """ + # Return docker run options for a Jupyter Lab container. + # + # :param container_name: Name for the Docker container + # :param host_port: Host port to forward to container port 8888 + # :param jupyter_use_vim: 0 or 1 to enable vim bindings + # :return: docker run options string + # """ + local container_name=$1 + local host_port=$2 + local jupyter_use_vim=$3 + # Run as the current user when user is saggese. + if [[ "$(whoami)" == "saggese" ]]; then + echo "Overwriting jupyter_use_vim since user='saggese'" >&2 + jupyter_use_vim=1 + fi + echo "--name $container_name \ + -p $host_port:8888 \ + $(get_docker_common_options) \ + -e JUPYTER_USE_VIM=$jupyter_use_vim" +} + + +configure_jupyter_vim_keybindings() { + # """ + # Configure JupyterLab vim keybindings based on JUPYTER_USE_VIM env var. + # + # Reads JUPYTER_USE_VIM; if 1, verifies jupyterlab_vim is installed and + # writes enabled settings; otherwise writes disabled settings. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@axlair/jupyterlab_vim + if [[ $JUPYTER_USE_VIM == 1 ]]; then + # Check that jupyterlab_vim is installed before trying to enable it. + if ! pip show jupyterlab_vim > /dev/null 2>&1; then + echo "ERROR: jupyterlab_vim is not installed but vim bindings were requested." 
+ echo "Install it with: pip install jupyterlab_vim" + exit 1 + fi + echo "Enabling vim." + cat < ~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings +{ + "enabled": true, + "enabledInEditors": true, + "extraKeybindings": [], + "autosaveInterval": 6 +} +EOF + else + echo "Disabling vim." + cat < ~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings +{ + "enabled": false, + "enabledInEditors": false, + "extraKeybindings": [], + "autosaveInterval": 6 +} +EOF + fi; +} + + +configure_jupyter_notifications() { + # """ + # Disable JupyterLab news fetching and update checks. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension + cat < ~/.jupyter/lab/user-settings/\@jupyterlab/apputils-extension/notification.jupyterlab-settings +{ + // Notifications + // @jupyterlab/apputils-extension:notification + // Notifications settings. + + // Fetch official Jupyter news + // Whether to fetch news from the Jupyter news feed. If Always (`true`), it will make a request to a website. + "fetchNews": "false", + "checkForUpdates": false +} +EOF +} + + +configure_jupyter_autosave() { + # """ + # Configure JupyterLab global autosave interval to 6 seconds. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/docmanager-extension + cat < ~/.jupyter/lab/user-settings/\@jupyterlab/docmanager-extension/plugin.jupyterlab-settings +{ + "autosaveInterval": 6 +} +EOF +} + + +check_jupytext_installed() { + # """ + # Verify that jupytext is installed before starting Jupyter Lab. + # + # Jupytext is required for pair notebook/Python file functionality. + # Exits with error if jupytext is not installed. + # """ + if ! pip show jupytext > /dev/null 2>&1; then + echo "ERROR: jupytext is not installed but is required to run Jupyter Lab." + echo "Install it with: pip install jupytext" + exit 1 + fi +} + + +setup_jupyter_environment() { + # """ + # Configure Jupyter Lab environment before launching. 
+ # + # Performs all necessary setup steps: + # - Configure vim keybindings + # - Disable notifications + # - Configure autosave interval + # - Verify jupytext is installed + # """ + configure_jupyter_vim_keybindings + configure_jupyter_notifications + configure_jupyter_autosave + check_jupytext_installed +} + + +get_jupyter_args() { + # """ + # Print the standard Jupyter Lab command-line arguments. + # + # :return: space-separated Jupyter Lab args for port 8888 with no browser, + # allow root, and no authentication + # """ + echo "--port=8888 --no-browser --ip=0.0.0.0 --allow-root --ServerApp.token='' --ServerApp.password=''" +} + + +get_run_jupyter_cmd() { + # """ + # Return the command to run run_jupyter.sh inside a container. + # + # Computes the script's path relative to GIT_ROOT and builds the + # corresponding /git_root/... path used inside the container. + # + # :param script_path: path of the calling script (pass ${BASH_SOURCE[0]}) + # :param cmd_opts: options to forward to run_jupyter.sh + # :return: full command string to run run_jupyter.sh + # """ + local script_path=$1 + local cmd_opts=$2 + local script_dir + script_dir=$(cd "$(dirname "$script_path")" && pwd) + local rel_dir="${script_dir#${GIT_ROOT}/}" + echo "/git_root/${rel_dir}/run_jupyter.sh $cmd_opts" +} + + +list_and_inspect_docker_image() { + # """ + # List available Docker images and inspect their architecture. + # + # Lists all images matching FULL_IMAGE_NAME and attempts to inspect + # their architecture using docker manifest inspect. + # """ + run "docker image ls $FULL_IMAGE_NAME" + (docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true +} + + +kill_existing_container_if_forced() { + # """ + # Kill existing container if FORCE flag is set. + # + # If FORCE is set to 1, kills and removes the container with name + # CONTAINER_NAME. This is typically set by the -f flag. 
+ # """ + if [[ $FORCE == 1 ]]; then + kill_container_by_name $CONTAINER_NAME + fi +} diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh new file mode 100755 index 000000000..c46ed254c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# """ +# Display versions of installed tools and packages. +# +# This script prints version information for Python, pip, Jupyter, and all +# installed Python packages. Used for debugging and documentation purposes +# to verify the Docker container environment setup. +# """ + +# Display Python 3 version. +echo "# Python3" +python3 --version + +# Display pip version. +echo "# pip3" +pip3 --version + +# Display Jupyter version. +echo "# jupyter" +jupyter --version + +# List all installed Python packages and their versions. +echo "# Python packages" +pip3 list + +# Template for adding additional tool versions. 
+# echo "# mongo" +# mongod --version From 570446d21edaf6fc586896533c824fe995c377af Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Wed, 29 Apr 2026 16:31:47 -0400 Subject: [PATCH 2/8] Added dataloading and preprocessing scripts --- .../README.md | 38 +++++++- .../project_files/config.py | 28 ++++++ .../project_files/requirements.txt | 13 +++ .../project_files/utils/dataset_loader.py | 72 ++++++++++++++ .../project_files/utils/preprocessing.py | 94 +++++++++++++++++++ 5 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/config.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md index 2953bc5f7..c6fd1077f 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md @@ -12,4 +12,40 @@ The full stack uses HuggingFace Transformers and Datasets for tokenization and f https://github.com/gpsaggese/gpsaggese.github.io/blob/master/class_project/data605/Spring2026/projects_descriptions/HuggingFace_Project_Description.md **Authors**: @riyaapuri @stupatel17 
-**Assigned to**: @riyaapuri @stupatel17 @protocorn @gpsaggese \ No newline at end of file +**Assigned to**: @riyaapuri @stupatel17 @protocorn @gpsaggese + +## Stack + +| Layer | Library | +|---|---| +| Modeling & Tokenization | HuggingFace Transformers, Datasets | +| Training Backend | PyTorch, Accelerate | +| Evaluation | Scikit-learn, HuggingFace Evaluate | +| Hyperparameter Tuning | Optuna | +| Serving | FastAPI | +| Dashboard | Streamlit | + +--- + +## Release v1.0 + +### `config.py` +Central config for all constants — dataset name, label mappings, model checkpoints, training hyperparameters, and paths. Imported by every other module to avoid hardcoded values. + +### `dataset_loader.py` +Loads AG News from the HuggingFace hub. Optionally subsets train/test splits for faster iteration. Includes `summarize_dataset()` for label-distribution stats and `get_sample_articles()` for spot-checking raw examples. + +### `preprocessing.py` +Three-stage pipeline: **clean → tokenize → format**. +- `clean_text()` strips HTML entities, URLs, and excess whitespace. Punctuation and casing are intentionally preserved for the tokenizer. +- `get_tokenizer()` loads an `AutoTokenizer` for any HF checkpoint. +- `make_tokenize_fn()` returns a closure for use with `dataset.map()`, applying cleaning + tokenization with `padding="max_length"` and `truncation=True` at `MAX_LENGTH=128`. +- `tokenize_dataset()` runs batched tokenization over the full `DatasetDict` and sets torch format on the output columns (`input_ids`, `attention_mask`, `label`). + +### `metrics.py` +Two evaluation utilities: +- `compute_metrics()` — Trainer callback returning `accuracy` and `f1_macro` after each eval step. +- `full_report()` — generates a detailed sklearn classification report and confusion matrix, used during final model evaluation. + +### `requirements.txt` +Pins all dependencies. Key versions: `transformers>=4.35`, `torch>=2.0`, `datasets>=2.14`, `scikit-learn>=1.3`, `optuna>=3.3`. 
\ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/config.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/config.py new file mode 100644 index 000000000..e6043a234 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/config.py @@ -0,0 +1,28 @@ +#Dataset +DATASET_NAME = "ag_news" +NUM_LABELS = 4 +LABEL_NAMES = ["World", "Sports", "Business", "Sci/Tech"] +LABEL2ID = {name: i for i, name in enumerate(LABEL_NAMES)} +ID2LABEL = {i: name for i, name in enumerate(LABEL_NAMES)} + +#Model +DEFAULT_MODEL = "distilbert-base-uncased" +BERT_MODEL = "bert-base-uncased" +ROBERTA_MODEL = "roberta-base" + +#Training +OUTPUT_DIR = "models/distilbert-ag-news" +EPOCHS = 3 +BATCH_SIZE = 16 +LEARNING_RATE = 2e-5 +WEIGHT_DECAY = 0.01 +WARMUP_STEPS = 500 +MAX_LENGTH = 128 #Max token length per article +TRAIN_SUBSET = None +EVAL_SUBSET = None + +#Evaluation +RESULTS_DIR = "results" + +#Reproducibility +SEED = 42 diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt new file mode 100644 index 000000000..6a5304f04 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt @@ -0,0 +1,13 @@ +transformers>=4.35.0 +datasets>=2.14.0 +torch>=2.0.0 +scikit-learn>=1.3.0 +pandas>=2.0.0 +numpy>=1.24.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +accelerate>=0.24.0 +evaluate>=0.4.0 +tqdm>=4.65.0 +optuna>=3.3.0 +jupyterlab>=4.0.0 diff --git 
a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py new file mode 100644 index 000000000..28f940a31 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py @@ -0,0 +1,72 @@ +#Data Loading + +import random +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from datasets import load_dataset +import pandas as pd +from config import ( + DATASET_NAME, LABEL_NAMES, ID2LABEL, + TRAIN_SUBSET, EVAL_SUBSET, SEED +) + + +def load_ag_news(): + + #Load the AG News dataset from HuggingFace hub + #Each split has columns: 'text', 'label' + + print(f"Loading '{DATASET_NAME}' from HuggingFace") + dataset = load_dataset(DATASET_NAME) + print(f"Train size : {len(dataset['train']):,}") + print(f"Test size : {len(dataset['test']):,}") + return dataset + + +def get_subsets(dataset): + if TRAIN_SUBSET: + dataset["train"] = dataset["train"].shuffle(seed=SEED).select(range(TRAIN_SUBSET)) + print(f"Using train subset: {TRAIN_SUBSET}") + if EVAL_SUBSET: + dataset["test"] = dataset["test"].shuffle(seed=SEED).select(range(EVAL_SUBSET)) + print(f"Using test subset: {EVAL_SUBSET}") + return dataset + + +def summarize_dataset(dataset): + + # Print summary + print("\nDataset Summary:") + for split_name, split in dataset.items(): + df = pd.DataFrame(split) + print(f"\n Split: {split_name} ({len(df):,} examples)") + counts = df["label"].value_counts().sort_index() + for label_id, count in counts.items(): + label_name = ID2LABEL[label_id] + bar = "█" * (count // 1000) + print(f" [{label_id}] {label_name:<12} {count:>6,} {bar}") + print("─" * 52 + "\n") + + +def get_sample_articles(dataset, 
n=3, split="train"): + + # Return n random sample articles with their labels + indices = random.sample(range(len(dataset[split])), n) + samples = dataset[split].select(indices) + print(f"\n{n} Sample Articles from '{split}' split") + for i, row in enumerate(samples): + label_name = ID2LABEL[row["label"]] + print(f"\n [{i+1}] Label: {label_name}") + print(f" Text : {row['text'][:200]}...") + print("─" * 52 + "\n") + return samples + + +if __name__ == "__main__": + dataset = load_ag_news() + dataset = get_subsets(dataset) + summarize_dataset(dataset) + get_sample_articles(dataset) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py new file mode 100644 index 000000000..6d0a0f0f8 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py @@ -0,0 +1,94 @@ +# utils/preprocessing.py +import re +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from transformers import AutoTokenizer +from config import DEFAULT_MODEL, MAX_LENGTH + + +# Text Cleaning + +def clean_text(text: str) -> str: + # Collapse whitespace + text = re.sub(r"\s+", " ", text) + # Remove HTML entities + text = re.sub(r"&[a-zA-Z]+;|&#\d+;", " ", text) + # Remove URLs + text = re.sub(r"https?://\S+|www\.\S+", "", text) + return text.strip() + + +# Tokenizer Setup + +def get_tokenizer(model_name: str = DEFAULT_MODEL): + """ + Load a HuggingFace AutoTokenizer for the given model checkpoint. + + Parameters + ---------- + model_name : str + HuggingFace model hub ID (e.g. 
'distilbert-base-uncased') + + Returns + ------- + tokenizer : PreTrainedTokenizer + """ + print(f"[preprocessing] Loading tokenizer: {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) + return tokenizer + + +# Tokenization Pipeline + +def make_tokenize_fn(tokenizer, max_length: int = MAX_LENGTH): + # Creates dataset.map() -> ready tokenizer that cleans text, pads/truncates to max_length, + # Returns input_ids, attention_mask (token_type_ids for BERT). + def tokenize_fn(examples): + # Clean all texts in the batch + cleaned = [clean_text(t) for t in examples["text"]] + # Tokenize + encoded = tokenizer( + cleaned, + padding="max_length", + truncation=True, + max_length=max_length, + ) + return encoded + + return tokenize_fn + + +def tokenize_dataset(dataset, tokenizer, max_length: int = MAX_LENGTH): + + #Apply tokenization to an entire HuggingFace DatasetDict. + #Returns a tokenized_dataset : DatasetDict + print(f"[preprocessing] Tokenizing dataset (max_length={max_length})...") + tokenize_fn = make_tokenize_fn(tokenizer, max_length) + + tokenized = dataset.map( + tokenize_fn, + batched=True, + desc="Tokenizing", + remove_columns=["text"], # keep 'label', add token columns + ) + + # Set output format for PyTorch + tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"]) + print("[preprocessing] Tokenization complete.") + return tokenized + + +# Quick test + +if __name__ == "__main__": + sample = " NASA launches new satellite & rover into orbit. https://nasa.gov " + print("Raw :", repr(sample)) + print("Clean :", repr(clean_text(sample))) + + tok = get_tokenizer() + fn = make_tokenize_fn(tok) + result = fn({"text": [sample]}) + print("Tokens:", result["input_ids"]) From f6fb5af5ef976064ef3815cb10e8ae50787ba919 Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Tue, 5 May 2026 01:21:17 -0400 Subject: [PATCH 3/8] Model training and prediction scripts added. Updated template files to create Docker pipeline. 
--- .../Dockerfile | 38 ++-- .../README.md | 104 +++++++++- .../docker_bash.sh | 36 +--- .../docker_build.sh | 46 ++--- .../docker_clean.sh | 33 ++- .../docker_jupyter.sh | 45 ++-- .../docker_name.sh | 17 +- .../docker_predict.sh | 32 +++ .../docker_train.sh | 25 +++ .../docker_utils.sh | 35 ++++ .../project_files/requirements.txt | 6 +- .../project_files/scripts/predict.py | 128 ++++++++++++ .../project_files/scripts/train.py | 195 ++++++++++++++++++ .../project_files/utils/dataset_loader.py | 14 +- .../project_files/utils/metrics.py | 68 ++++++ .../run.sh | 67 ++++++ .../run_jupyter.sh | 40 +--- .../utils.sh | 36 ---- .../version.sh | 47 ++--- 19 files changed, 775 insertions(+), 237 deletions(-) mode change 100644 => 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_utils.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/metrics.py create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh mode change 100644 => 100755 
class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile index f5c02c562..2b73a9278 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/Dockerfile @@ -1,30 +1,40 @@ -# Use Python 3.12 slim (already has Python and pip). FROM python:3.12-slim -# Avoid interactive prompts during apt operations. ENV DEBIAN_FRONTEND=noninteractive -# Install CA certificates (needed for HTTPS). -RUN apt-get update && apt-get install -y \ +RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ + git \ + build-essential \ + g++ \ + libgomp1 \ && rm -rf /var/lib/apt/lists/* -# Install project specific packages. RUN mkdir -p /install -COPY requirements.txt /install/requirements.txt +COPY project_files/requirements.txt /install/requirements.txt RUN pip install --upgrade pip && \ - pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + pip install --no-cache-dir \ + jupyterlab \ + jupyterlab_vim \ + jupytext \ + -r /install/requirements.txt -# Config. -COPY etc_sudoers /install/ COPY etc_sudoers /etc/sudoers -COPY bashrc /root/.bashrc +COPY bashrc /root/.bashrc -# Report package versions. -COPY version.sh /install/ -RUN /install/version.sh 2>&1 | tee version.log +# Version report +COPY version.sh /install/version.sh +RUN chmod +x /install/version.sh && \ + /install/version.sh 2>&1 | tee /install/version.log -# Jupyter. 
+# Working directory +WORKDIR /app + +# HuggingFace cache dir +ENV HF_HOME=/hf_cache +ENV TRANSFORMERS_CACHE=/hf_cache + +# Jupyter port EXPOSE 8888 CMD ["/bin/bash"] diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md index c6fd1077f..660b6939c 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md @@ -48,4 +48,106 @@ Two evaluation utilities: - `full_report()` — generates a detailed sklearn classification report and confusion matrix, used during final model evaluation. ### `requirements.txt` -Pins all dependencies. Key versions: `transformers>=4.35`, `torch>=2.0`, `datasets>=2.14`, `scikit-learn>=1.3`, `optuna>=3.3`. \ No newline at end of file +Pins all dependencies. Key versions: `transformers>=4.35`, `torch>=2.0`, `datasets>=2.14`, `scikit-learn>=1.3`, `optuna>=3.3`. + +--- + +## Release v2.0 + +### How to run (this will be cleaned up before the final commit) + +**Prerequisites:** Docker Desktop installed and running. No other local dependencies needed from this commit onwards. 
+ +```bash +# Full pipeline — build, load data, train, predict +./run.sh + +# With a custom model and prediction text +./run.sh --model roberta-base --epochs 5 --text "Fed raises interest rates again" + +# Skip rebuild if image already exists +./run.sh --skip-build --text "Apple reports record iPhone sales" + +# Run individual steps +./docker_build.sh # build image +./docker_dataloader.sh # inspect dataset +./docker_train.sh --model bert-base-uncased --epochs 3 # fine-tune +./docker_predict.sh --text "NASA launches new satellite" # single prediction +./docker_predict.sh # interactive mode +./docker_jupyter.sh # open Jupyter Lab +``` + +--- + +### New Files + +#### `scripts/train.py` +Fine-tunes a transformer model on AG News end-to-end. Orchestrates the full training pipeline in five steps: +1. Loads and preprocesses the dataset via the `dataset_loader` and `preprocessing` utilities. +2. Instantiates `AutoModelForSequenceClassification` with a classification head (dropout + linear projection to 4 labels) on top of the pre-trained transformer backbone. +3. Configures HuggingFace `Trainer` with: linear LR warmup over 500 steps, weight decay regularization, per-epoch evaluation, and macro-F1 as the checkpoint selection metric. +4. Runs training with optional `fp16` on CUDA for speed. +5. Saves the best checkpoint to `models/<model_name>/best/` and writes a training log to `train_results.txt`. +Accepts CLI flags `--model`, `--epochs`, `--batch_size`, `--lr` so any backbone (DistilBERT, BERT, RoBERTa) can be swapped without touching code. + +#### `scripts/predict.py` +Loads a fine-tuned checkpoint and runs inference in three modes: +- `--text` — classify a single article passed as a string. +- `--file` — classify all articles in a text file (one per line). +- Interactive — prompts for articles in a loop until `Ctrl+C`. +Outputs the predicted label, confidence percentage, and a score bar for all four classes.
Falls back gracefully with a clear error if the model checkpoint is not found. + +#### `run.sh` +Unified pipeline wrapper that runs the entire workflow in a single command. Executes four steps in order: build image → load dataset → train model → run prediction. Accepts `--skip-build` to avoid rebuilding when the image already exists, `--text` to set a custom prediction article, and all `train.py` flags (`--model`, `--epochs`, `--batch_size`, `--lr`) which are forwarded directly to the training step. + +#### `docker_utils.sh` +Shared helper library sourced by every `docker_*.sh` script. Provides three functions: `run()` to echo and execute commands, `load_docker_vars()` to source `docker_name.sh` and print resolved image names, and `base_run_opts()` to build the standard `docker run` flags including the code volume mount and HuggingFace cache volume. + +#### `docker_train.sh` +Runs `scripts/train.py` inside the container. Forwards all CLI arguments directly to the script, so any combination of `--model`, `--epochs`, `--batch_size`, and `--lr` works without modifying the script. The trained model is saved to `./models/` on the host via the volume mount. + +#### `docker_predict.sh` +Runs `scripts/predict.py` inside the container. Allocates a TTY only when called with no arguments (interactive mode), allowing it to also be called non-interactively from `run.sh` without a "not a TTY" error. + +#### `docker_dataloader.sh` +Runs `utils/dataset_loader.py` inside the container. Prints dataset size, per-label distribution, and sample articles so the data can be inspected before committing to a full training run. + +#### `version.sh` +Executed inside the container at image build time. Prints and logs the versions of all key packages (torch, transformers, datasets, etc.) to `/install/version.log`, making the image reproducible and debuggable. 
+ +--- + +### Modified Files + +#### `config.py` +Added `BERT_MODEL` and `ROBERTA_MODEL` checkpoint constants alongside the existing `DEFAULT_MODEL` (DistilBERT) so alternative backbones can be referenced by name across the codebase without hardcoding strings. `RESULTS_DIR` path added for evaluation outputs in future commits. + +#### `utils/dataset_loader.py` +Added a validation split used to select the best-fit model before saving it. AG News does not ship with a validation split, so 10% of the training set is held out for validation (90/10 train/validation) and the final evaluation is still done on the untouched, unseen test data. + +#### `utils/metrics.py` +Updated import paths to reflect the new `project_files/` directory structure. No logic changes — `compute_metrics()` and `full_report()` behave identically to v1.0. + +#### `utils/preprocessing.py` +Fixed the `sys.path` insert to correctly locate `config.py` when the script is called from inside the `utils/` subdirectory as part of the restructured project layout. + +#### `Dockerfile` +Updated from the bare template to a full ML image: +- Base stays `python:3.12-slim`; added system packages `git`, `build-essential`, `g++`, `libgomp1`. +- `COPY` path for `requirements.txt` changed to `project_files/requirements.txt`. +- Added `ENV HF_HOME=/hf_cache` and `ENV TRANSFORMERS_CACHE=/hf_cache` so all HuggingFace model downloads survive container restarts. +- Project code is not copied into the image — it is volume-mounted at runtime, so code edits require no rebuild. +#### `docker_build.sh` +Rewritten to be fully standalone — removed the dependency on the monorepo `utils.sh` via `git rev-parse`. Sources `docker_utils.sh` instead. Enables `DOCKER_BUILDKIT=1` for faster cached layer builds. Extra args (e.g. `--no-cache`) are passed through to `docker build`. + +#### `docker_bash.sh` +Rewritten standalone. Opens an interactive bash shell inside the container with the project directory live-mounted at `/app`.
Used for manual debugging and exploration. + +#### `docker_jupyter.sh` +Rewritten standalone. Launches Jupyter Lab in a detached container with port-forwarding (`host:8888 → container:8888`). Port can be overridden via `JUPYTER_PORT` env var. + +#### `docker_clean.sh` +Rewritten standalone. Gains a `--volumes` flag that additionally removes the `hf_cache` named volume, wiping downloaded model weights for a full clean slate. + +#### `run_jupyter.sh` +Simplified — removed monorepo framework calls. Retains the same Jupyter flags: `--no-browser`, `--ip=0.0.0.0`, `--allow-root`, no token/password. \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh index 0025e81f4..54f34f78a 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_bash.sh @@ -1,34 +1,14 @@ #!/bin/bash -# """ -# This script launches a Docker container with an interactive bash shell for -# development. -# """ +# The current directory is mounted at /app so all code changes are live. +# ./docker_bash.sh -# Exit immediately if any command exits with a non-zero status. set -e -# Import the utility functions from the project template. -GIT_ROOT=$(git rev-parse --show-toplevel) -source $GIT_ROOT/class_project/project_template/utils.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/docker_utils.sh" +load_docker_vars -# Parse default args (-h, -v) and enable set -x if -v is passed. -parse_default_args "$@" +CONTAINER_NAME="${IMAGE_NAME}_bash" +OPTS=$(base_run_opts "$CONTAINER_NAME" "-it") -# Load Docker configuration variables for this script. 
-get_docker_vars_script ${BASH_SOURCE[0]} -source $DOCKER_NAME -print_docker_vars - -# List the available Docker images matching the expected image name. -run "docker image ls $FULL_IMAGE_NAME" - -# Configure and run the Docker container with interactive bash shell. -# - Container is removed automatically on exit (--rm) -# - Interactive mode with TTY allocation (-ti) -# - Port forwarding for Jupyter or other services -# - Git root mounted to /git_root inside container -CONTAINER_NAME=${IMAGE_NAME}_bash -PORT= -DOCKER_CMD=$(get_docker_bash_command) -DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT) -run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME" +run "docker run $OPTS $FULL_IMAGE_NAME /bin/bash" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh index 5b0957a99..aaa0ac4ee 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh @@ -1,40 +1,22 @@ #!/bin/bash -# """ -# Build a Docker container image for the project. -# -# This script sets up the build environment with error handling and command -# tracing, loads Docker configuration from docker_name.sh, and builds the -# Docker image using the build_container_image utility function. It supports -# both single-architecture and multi-architecture builds via the -# DOCKER_BUILD_MULTI_ARCH environment variable. -# """ +# ./docker_build.sh --no-cache # force full rebuild (re-installs all deps) -# Exit immediately if any command exits with a non-zero status. set -e -# Import the utility functions. 
-GIT_ROOT=$(git rev-parse --show-toplevel) -source $GIT_ROOT/class_project/project_template/utils.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/docker_utils.sh" +load_docker_vars -# Parse default args (-h, -v) and enable set -x if -v is passed. -# Shift processed option flags so remaining args are passed to the build. -parse_default_args "$@" -shift $((OPTIND-1)) - -# Load Docker configuration variables (REPO_NAME, IMAGE_NAME, FULL_IMAGE_NAME). -get_docker_vars_script ${BASH_SOURCE[0]} -source $DOCKER_NAME -print_docker_vars - -# Configure Docker build settings. -# Enable BuildKit for improved build performance and features. +# Enable BuildKit for faster, cached layer builds. export DOCKER_BUILDKIT=1 -#export DOCKER_BUILDKIT=0 -# Configure single-architecture build (set to 1 for multi-arch build). -#export DOCKER_BUILD_MULTI_ARCH=1 -export DOCKER_BUILD_MULTI_ARCH=0 +# Pass any extra args (e.g. --no-cache) straight through to docker build. +EXTRA_ARGS="$*" + +run "docker build $EXTRA_ARGS -t $FULL_IMAGE_NAME $SCRIPT_DIR" -# Build the container image. -# Pass extra arguments (e.g., --no-cache) via command line after -v. -build_container_image "$@" +echo "" +echo "✅ Image built: $FULL_IMAGE_NAME" +echo " Run './docker_bash.sh' to open an interactive shell." +echo " Run './docker_train.sh' to start fine-tuning." +echo " Run './docker_jupyter.sh' to launch Jupyter Lab." 
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh index 7e40839ae..ff086b13e 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh @@ -1,26 +1,19 @@ #!/bin/bash -# """ -# Remove Docker container image for the project. -# -# This script cleans up Docker images by removing the container image -# matching the project configuration. Useful for freeing disk space or -# ensuring a fresh build. -# """ - -# Exit immediately if any command exits with a non-zero status. +# Remove the project Docker image +# ./docker_clean.sh --volumes # also removes the HF cache named volume set -e -# Import the utility functions. -GIT_ROOT=$(git rev-parse --show-toplevel) -source $GIT_ROOT/class_project/project_template/utils.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/docker_utils.sh" +load_docker_vars -# Parse default args (-h, -v) and enable set -x if -v is passed. -parse_default_args "$@" +run "docker image rm -f $FULL_IMAGE_NAME" || true -# Load Docker configuration variables for this script. -get_docker_vars_script ${BASH_SOURCE[0]} -source $DOCKER_NAME -print_docker_vars +if [[ "$1" == "--volumes" ]]; then + echo "Also removing named volume: $HF_CACHE_VOLUME" + run "docker volume rm $HF_CACHE_VOLUME" || true +fi -# Remove the container image. -remove_container_image +echo "" +run "docker ps -a" +echo "✅ Cleanup complete. Run './docker_build.sh' to rebuild." 
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh index 1a60dfd3a..d70263216 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh @@ -1,39 +1,24 @@ #!/bin/bash -# """ -# Execute Jupyter Lab in a Docker container. -# -# This script launches a Docker container running Jupyter Lab with -# configurable port, directory mounting, and vim bindings. It passes -# command-line options to the run_jupyter.sh script inside the container. -# -# Usage: -# > docker_jupyter.sh [options] -# """ -# Exit immediately if any command exits with a non-zero status. +# JUPYTER_PORT=8889 ./docker_jupyter.sh +# http://localhost:8888/lab set -e -# Import the utility functions. -GIT_ROOT=$(git rev-parse --show-toplevel) -source $GIT_ROOT/class_project/project_template/utils.sh +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/docker_utils.sh" +load_docker_vars -# Parse command-line options and set Jupyter configuration variables. -parse_docker_jupyter_args "$@" +CONTAINER_NAME="${IMAGE_NAME}_jupyter" -# Load Docker configuration variables for this script. -get_docker_vars_script ${BASH_SOURCE[0]} -source $DOCKER_NAME -print_docker_vars +# -p maps host:container port; -d runs detached so the terminal stays free. +OPTS=$(base_run_opts "$CONTAINER_NAME" "-d -p ${JUPYTER_PORT}:8888") -# List available Docker images and inspect architecture. -list_and_inspect_docker_image +echo "🔬 Starting Jupyter Lab on http://localhost:${JUPYTER_PORT}/lab" +echo " (container: $CONTAINER_NAME)" +echo "" -# Run the Docker container with Jupyter Lab. 
-CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") -CONTAINER_NAME=$IMAGE_NAME -# Kill existing container if -f flag is set. -kill_existing_container_if_forced +run "docker run $OPTS $FULL_IMAGE_NAME /bin/bash run_jupyter.sh" -DOCKER_CMD=$(get_docker_jupyter_command) -DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) -run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" +echo "" +echo "✅ Jupyter is running. Open: http://localhost:${JUPYTER_PORT}/lab" +echo " Stop with: docker stop $CONTAINER_NAME" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh old mode 100644 new mode 100755 index 32a546cf3..5c746a9c4 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_name.sh @@ -1,12 +1,11 @@ #!/bin/bash -# """ -# Docker image naming configuration. -# -# This file defines the repository name, image name, and full image name -# variables used by all docker_*.sh scripts in the project template. -# """ -REPO_NAME=gpsaggese -# The file should be all lower case. -IMAGE_NAME=umd_project_template +REPO_NAME=data605_class_project +IMAGE_NAME=huggingface_text_classifier FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + +# Named volume for HuggingFace model downloads (shared across all containers). +HF_CACHE_VOLUME=hf_cache + +# Default Jupyter port (override with JUPYTER_PORT env var if needed). 
+JUPYTER_PORT=${JUPYTER_PORT:-8888}
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh
new file mode 100755
index 000000000..1ebcf913f
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# ─────────────────────────────────────────────────────────────────────────────
+# Run inference with the fine-tuned model inside Docker.
+#
+# All CLI flags are forwarded directly to predict.py:
+#
+# Usage:
+#   ./docker_predict.sh --text "Apple reports record iPhone sales"
+#   ./docker_predict.sh --file /app/articles.txt # file must be inside ./
+#   ./docker_predict.sh # interactive mode
+#
+# Note: run ./docker_train.sh first so the model checkpoint exists.
+# ─────────────────────────────────────────────────────────────────────────────
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/docker_utils.sh"
+load_docker_vars
+
+CONTAINER_NAME="${IMAGE_NAME}_predict"
+
+# Only allocate a TTY for interactive mode (no args).
+# When --text or --file is passed it runs non-interactively (e.g. from run.sh).
+# run() eval's one flat string, so every forwarded argument is re-quoted with
+# printf %q; a plain $* would split --text "a b c" into separate words.
+if [[ $# -eq 0 ]]; then
+  EXTRA="-it"
+  ARGS=""
+else
+  EXTRA=""
+  ARGS=$(printf '%q ' "$@")
+fi
+
+OPTS=$(base_run_opts "$CONTAINER_NAME" "$EXTRA")
+
+run "docker run $OPTS $FULL_IMAGE_NAME python project_files/scripts/predict.py $ARGS"
\ No newline at end of file
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh
new file mode 100755
index 000000000..4fe1c06c3
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Fine-tune the transformer model inside Docker.
+#   ./docker_train.sh # DistilBERT default
+#   ./docker_train.sh --model bert-base-uncased # swap backbone
+#   ./docker_train.sh --epochs 5 --batch_size 32 # override hyperparams
+#   ./docker_train.sh --model roberta-base --lr 3e-5
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/docker_utils.sh"
+load_docker_vars
+
+CONTAINER_NAME="${IMAGE_NAME}_train"
+# No -it so it can be run non-interactively (e.g. in CI or nohup).
+OPTS=$(base_run_opts "$CONTAINER_NAME")
+
+# Re-quote forwarded arguments so values containing spaces survive the eval in
+# run(). Guarded because printf %q with no arguments would emit an empty ''.
+ARGS=""
+if [[ $# -gt 0 ]]; then
+  ARGS=$(printf '%q ' "$@")
+fi
+
+echo "🚀 Starting training — args: $*"
+echo " Fine-tuned model will be saved to ./models/ on your host."
+echo ""
+
+run "docker run $OPTS $FULL_IMAGE_NAME python project_files/scripts/train.py $ARGS"
+
+echo ""
+echo "✅ Training complete. Check ./models/ for the saved checkpoint."
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_utils.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_utils.sh new file mode 100755 index 000000000..75ccc5a08 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_utils.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# docker_utils.sh — shared helpers sourced by all docker_*.sh scripts. + +# - run() : echo + execute a command +# - load_docker_vars() : source docker_name.sh and print the resolved names +# - base_run_opts() : common `docker run` flags used by every script + +run() { + echo "+ $*" + eval "$*" +} + +# Source docker_name.sh (always relative to the script calling this file). +load_docker_vars() { + local script_dir + script_dir="$(cd "$(dirname "${BASH_SOURCE[1]}")" && pwd)" + # shellcheck source=docker_name.sh + source "$script_dir/docker_name.sh" + echo "──────────────────────────────────────────" + echo " REPO : $REPO_NAME" + echo " IMAGE : $IMAGE_NAME" + echo " FULL IMAGE : $FULL_IMAGE_NAME" + echo " HF VOLUME : $HF_CACHE_VOLUME" + echo "──────────────────────────────────────────" +} + +base_run_opts() { + local container_name="$1" + local extra="${2:-}" + echo "--rm \ + --name $container_name \ + -v \"$(pwd):/app\" \ + -v \"$HF_CACHE_VOLUME:/hf_cache\" \ + $extra" +} diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt index 6a5304f04..8beb6c981 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt +++ 
b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/requirements.txt
@@ -1,13 +1,13 @@
 transformers>=4.35.0
 datasets>=2.14.0
 torch>=2.0.0
+accelerate>=0.24.0
+evaluate>=0.4.0
 scikit-learn>=1.3.0
 pandas>=2.0.0
 numpy>=1.24.0
 matplotlib>=3.7.0
 seaborn>=0.12.0
-accelerate>=0.24.0
-evaluate>=0.4.0
 tqdm>=4.65.0
 optuna>=3.3.0
-jupyterlab>=4.0.0
+jupyterlab>=4.0.0
\ No newline at end of file
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py
new file mode 100644
index 000000000..7777ea7d8
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py
@@ -0,0 +1,153 @@
+# scripts/predict.py
+"""
+COMMIT 2 — Inference
+======================
+Run the fine-tuned model on custom text input.
+
+Usage
+-----
+    python scripts/predict.py --text "Apple reports record iPhone sales"
+    python scripts/predict.py --file articles.txt # one article per line
+    python scripts/predict.py # interactive mode
+"""
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+from config import OUTPUT_DIR, MAX_LENGTH, ID2LABEL
+from utils.preprocessing import clean_text
+
+
+def load_model(model_dir: str):
+    """
+    Load a fine-tuned model and tokenizer from disk.
+
+    Prefers the `best/` sub-directory of `model_dir` and otherwise loads from
+    `model_dir` itself. There is no HuggingFace hub fallback: if that directory
+    does not exist, a hint is printed and the process exits with status 1.
+
+    Returns
+    -------
+    (tokenizer, model) : the model is already switched to eval mode
+    """
+    best_path = os.path.join(model_dir, "best")
+    load_path = best_path if os.path.isdir(best_path) else model_dir
+
+    if not os.path.isdir(load_path):
+        print(f"[predict] ⚠️ Model not found at '{load_path}'.")
+        print("[predict] Please run `python scripts/train.py` first.")
+        sys.exit(1)
+
+    print(f"[predict] Loading model from: {load_path}")
+    tokenizer = AutoTokenizer.from_pretrained(load_path)
+    model = AutoModelForSequenceClassification.from_pretrained(load_path)
+    model.eval()
+    return tokenizer, model
+
+
+def predict(texts, tokenizer, model, device="cpu"):
+    """
+    Predict category labels for a list of texts.
+
+    Parameters
+    ----------
+    texts : list of str (an empty list returns an empty result)
+    tokenizer : tokenizer returned by load_model()
+    model : model returned by load_model()
+    device : torch device string the tokenized inputs are moved to
+
+    Returns
+    -------
+    results : list of dict
+        Each dict has 'text', 'label', 'confidence', 'all_scores'
+    """
+    # Nothing to classify (e.g. an empty --file): skip the tokenizer call.
+    if not texts:
+        return []
+
+    cleaned = [clean_text(t) for t in texts]
+    inputs = tokenizer(
+        cleaned,
+        padding=True,
+        truncation=True,
+        max_length=MAX_LENGTH,
+        return_tensors="pt",
+    ).to(device)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
+
+    results = []
+    for text, prob_row in zip(texts, probs):
+        pred_id = prob_row.argmax()
+        results.append({
+            # Long articles are shortened to 120 characters for display only.
+            "text": (text[:120] + "...") if len(text) > 120 else text,
+            "label": ID2LABEL[pred_id],
+            "confidence": round(float(prob_row[pred_id]) * 100, 2),
+            "all_scores": {ID2LABEL[i]: round(float(p) * 100, 2) for i, p in enumerate(prob_row)},
+        })
+    return results
+
+
+def display_results(results):
+    """Print each result from predict() with its per-class scores, highest first."""
+    for i, r in enumerate(results, 1):
+        print(f"\n── Result {i} {'─'*45}")
+        print(f" Text : {r['text']}")
+        print(f" Prediction : {r['label']} ({r['confidence']}% confidence)")
+        print(f" All scores :")
+        for label, score in sorted(r["all_scores"].items(), key=lambda x: -x[1]):
+            bar = "▓" * int(score // 5)
+            print(f" {label:<12} {score:>6.2f}% {bar}")
+
+
+def main():
+    """Parse CLI flags, load the model, then run --text, --file or interactive inference."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--text", type=str, default=None, help="Article text to classify.")
+    parser.add_argument("--file", type=str, default=None, help="Path to a text file (one article per line).")
+    parser.add_argument("--model_dir", type=str, default=OUTPUT_DIR, help="Directory of the fine-tuned model.")
+    args = parser.parse_args()
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tokenizer, model = load_model(args.model_dir)
+    model = model.to(device)
+
+    if args.text:
+        texts = [args.text]
+    elif args.file:
+        # Explicit encoding so non-ASCII articles are read the same on every host.
+        with open(args.file, encoding="utf-8") as f:
+            texts = [line.strip() for line in f if line.strip()]
+        if not texts:
+            print(f"[predict] No articles found in '{args.file}'.")
+            return
+    else:
+        print("[predict] Interactive mode — type an article and press Enter (Ctrl+C to quit).")
+        while True:
+            try:
+                t = input("\n📰 Article: ").strip()
+                if t:
+                    results = predict([t], tokenizer, model, device)
+                    display_results(results)
+            except (KeyboardInterrupt, EOFError):
+                # EOFError is raised when stdin is closed (Ctrl+D or no TTY attached).
+                print("\nBye!")
+                break
+        return
+
+    results = predict(texts, tokenizer, model, device)
+    display_results(results)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py
b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py new file mode 100644 index 000000000..44ffa555d --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py @@ -0,0 +1,195 @@ +# scripts/train.py +""" +COMMIT 2 — Model Selection & Fine-Tuning +========================================== +What this script does +--------------------- +1. Loads AG News dataset and applies the preprocessing pipeline from Commit 1. +2. Instantiates a pre-trained DistilBERT model with a classification head + (AutoModelForSequenceClassification). +3. Configures HuggingFace Trainer with TrainingArguments. +4. Fine-tunes the model for `EPOCHS` epochs. +5. Saves the best checkpoint to OUTPUT_DIR. + +Why DistilBERT as default? + - 40% smaller and 60% faster than BERT-base with ~97% of its accuracy. + - Great baseline for a 4-class news classification task. + - Can swap in BERT or RoBERTa via config.py for Commit 4 comparisons. + +Fine-tuning strategy + - All transformer layers are trainable (full fine-tune, not frozen). + - Linear learning rate warmup for 500 steps avoids large early gradient updates. + - WeightDecay regularization prevents overfitting on a relatively small task. + - evaluation_strategy="epoch" saves checkpoints every epoch and picks the best. 
+ +Usage +----- + python scripts/train.py # default DistilBERT + python scripts/train.py --model bert-base-uncased +""" + +import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from transformers import ( + AutoModelForSequenceClassification, + TrainingArguments, + Trainer, + DataCollatorWithPadding, +) +import torch + +from config import ( + DEFAULT_MODEL, OUTPUT_DIR, EPOCHS, BATCH_SIZE, + LEARNING_RATE, WEIGHT_DECAY, WARMUP_STEPS, + NUM_LABELS, LABEL_NAMES, ID2LABEL, LABEL2ID, SEED, +) +from utils.dataset_loader import load_ag_news, get_subsets +from utils.preprocessing import get_tokenizer, tokenize_dataset +from utils.metrics import compute_metrics + + +def parse_args(): + parser = argparse.ArgumentParser(description="Fine-tune a transformer for news classification.") + parser.add_argument( + "--model", type=str, default=DEFAULT_MODEL, + help="HuggingFace model checkpoint to fine-tune." + ) + parser.add_argument( + "--output_dir", type=str, default=None, + help="Where to save the fine-tuned model. Defaults to config OUTPUT_DIR." + ) + parser.add_argument( + "--epochs", type=int, default=EPOCHS, + help="Number of training epochs." + ) + parser.add_argument( + "--batch_size", type=int, default=BATCH_SIZE, + help="Per-device batch size." + ) + parser.add_argument( + "--lr", type=float, default=LEARNING_RATE, + help="Peak learning rate." + ) + return parser.parse_args() + + +def build_model(model_name: str): + """ + Load a pre-trained transformer model with a sequence classification head. + + AutoModelForSequenceClassification adds: + - A dropout layer + - A linear projection: hidden_size → num_labels + on top of the transformer backbone. Only the classification head is randomly + initialized; the transformer weights come from the pre-trained checkpoint. 
+ """ + print(f"\n[train] Loading model: {model_name}") + model = AutoModelForSequenceClassification.from_pretrained( + model_name, + num_labels=NUM_LABELS, + id2label=ID2LABEL, + label2id=LABEL2ID, + ) + total_params = sum(p.numel() for p in model.parameters()) + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"[train] Total params : {total_params:,}") + print(f"[train] Trainable params: {trainable:,}") + return model + + +def build_training_args(output_dir: str, epochs: int, batch_size: int, lr: float): + """ + Configure HuggingFace TrainingArguments. + + Key decisions + ------------- + - evaluation_strategy = "epoch" → evaluate on val set after every epoch + - load_best_model_at_end = True → restore best checkpoint after training + - metric_for_best_model = "f1_macro" → choose checkpoints by macro-F1 + - fp16 = auto-detected based on CUDA availability → faster on GPU + """ + use_fp16 = torch.cuda.is_available() + return TrainingArguments( + output_dir=output_dir, + num_train_epochs=epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size * 2, + learning_rate=lr, + weight_decay=WEIGHT_DECAY, + warmup_steps=WARMUP_STEPS, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="f1_macro", + greater_is_better=True, + logging_dir=os.path.join(output_dir, "logs"), + logging_steps=100, + seed=SEED, + fp16=use_fp16, + report_to="none", # disable W&B / MLflow unless you want them + ) + + +def main(): + args = parse_args() + model_name = args.model + out_dir = args.output_dir or OUTPUT_DIR + os.makedirs(out_dir, exist_ok=True) + + # ── Step 1: Load & preprocess data ───────────────────────────────────────── + dataset = load_ag_news() + dataset = get_subsets(dataset) + tokenizer = get_tokenizer(model_name) + tokenized = tokenize_dataset(dataset, tokenizer) + + train_ds = tokenized["train"] + eval_ds = tokenized["validation"] + + # ── Step 2: Build model 
───────────────────────────────────────────────────── + model = build_model(model_name) + + # ── Step 3: Configure training ────────────────────────────────────────────── + training_args = build_training_args(out_dir, args.epochs, args.batch_size, args.lr) + + # DataCollatorWithPadding pads each batch to its longest sequence + # (more efficient than global max_length padding) + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # ── Step 4: Train ─────────────────────────────────────────────────────────── + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=eval_ds, + processing_class=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + + print(f"\n[train] Starting fine-tuning — {args.epochs} epoch(s)...") + trainer.train() + + # ── Step 5: Save best model ───────────────────────────────────────────────── + best_path = os.path.join(out_dir, "best") + trainer.save_model(best_path) + tokenizer.save_pretrained(best_path) + print(f"\n[train] ✅ Best model saved to: {best_path}") + + # Save training metrics summary + metrics_path = os.path.join(out_dir, "train_results.txt") + with open(metrics_path, "w") as f: + f.write(f"Model: {model_name}\n") + f.write(f"Epochs: {args.epochs}\n") + f.write(f"Batch size: {args.batch_size}\n") + f.write(f"Learning rate: {args.lr}\n\n") + for log in trainer.state.log_history: + f.write(str(log) + "\n") + print(f"[train] Training log saved to: {metrics_path}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py index 28f940a31..c11962a0d 100644 --- 
a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py @@ -13,19 +13,15 @@ TRAIN_SUBSET, EVAL_SUBSET, SEED ) - def load_ag_news(): - - #Load the AG News dataset from HuggingFace hub - #Each split has columns: 'text', 'label' - - print(f"Loading '{DATASET_NAME}' from HuggingFace") dataset = load_dataset(DATASET_NAME) - print(f"Train size : {len(dataset['train']):,}") - print(f"Test size : {len(dataset['test']):,}") + # Split train into train + validation (90/10) + split = dataset["train"].train_test_split(test_size=0.1, seed=SEED) + dataset["train"] = split["train"] + dataset["validation"] = split["test"] + # Keep the original test set completely untouched return dataset - def get_subsets(dataset): if TRAIN_SUBSET: dataset["train"] = dataset["train"].shuffle(seed=SEED).select(range(TRAIN_SUBSET)) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/metrics.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/metrics.py new file mode 100644 index 000000000..74513f37f --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/metrics.py @@ -0,0 +1,68 @@ +# utils/metrics.py +""" + - compute_metrics() → used as callback during HuggingFace Trainer training + - full_report() → detailed sklearn classification report + +These are separated from the training script to keep concerns clean and allow +the same metric logic to be reused across multiple models in Commit 4. 
+""" + +import numpy as np +from sklearn.metrics import ( + accuracy_score, + f1_score, + classification_report, + confusion_matrix, +) +import evaluate # HuggingFace evaluate library + + +# Load HuggingFace accuracy metric (used inside Trainer) +_hf_accuracy = evaluate.load("accuracy") + + +def compute_metrics(eval_pred): + """ + Callback passed to HuggingFace Trainer. + Called at the end of every evaluation step. + + Parameters + ---------- + eval_pred : EvalPrediction + .predictions → raw logits, shape (N, num_labels) + .label_ids → true labels, shape (N,) + + Returns + ------- + dict with keys: 'accuracy', 'f1_macro' + """ + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + + acc = _hf_accuracy.compute(predictions=predictions, references=labels)["accuracy"] + f1 = f1_score(labels, predictions, average="macro") + + return { + "accuracy": round(acc, 4), + "f1_macro": round(f1, 4), + } + + +def full_report(y_true, y_pred, label_names): + """ + Generate a detailed classification report and confusion matrix. + + Parameters + ---------- + y_true : list or np.ndarray of true label IDs + y_pred : list or np.ndarray of predicted label IDs + label_names : list of str (e.g. 
['World', 'Sports', 'Business', 'Sci/Tech']) + + Returns + ------- + report : str (sklearn classification_report string) + cm : np.ndarray (confusion matrix) + """ + report = classification_report(y_true, y_pred, target_names=label_names, digits=4) + cm = confusion_matrix(y_true, y_pred) + return report, cm diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh new file mode 100755 index 000000000..4dd2b5652 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# ./run.sh # full pipeline, all defaults +# ./run.sh --model roberta-base # swap backbone +# ./run.sh --model bert-base-uncased --epochs 5 +# ./run.sh --text "Apple reports record sales" # custom prediction text +# ./run.sh --skip-build # skip image build step + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +SKIP_BUILD=0 +TRAIN_ARGS="" +PREDICT_TEXT="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --skip-build) + SKIP_BUILD=1 + shift + ;; + --text) + PREDICT_TEXT="$2" + shift 2 + ;; + --model|--epochs|--batch_size|--lr) + TRAIN_ARGS="$TRAIN_ARGS $1 $2" + shift 2 + ;; + *) + echo "Unknown argument: $1" + echo "Usage: ./run.sh [--skip-build] [--model ] [--epochs ] [--batch_size ] [--lr ] [--text
]" + exit 1 + ;; + esac +done + +step() { + echo "" + echo "════════════════════════════════════════════════" + echo " $1" + echo "════════════════════════════════════════════════" +} + +# Step 1: Build +if [[ $SKIP_BUILD -eq 0 ]]; then + step "Step 1/4 — Building Docker image" + bash "$SCRIPT_DIR/docker_build.sh" +else + step "Step 1/4 — Skipping build (--skip-build)" +fi + +# Step 2: Train +step "Step 3/4 — Training model" +bash "$SCRIPT_DIR/docker_train.sh" $TRAIN_ARGS + +# Step 3: Predict +step "Step 4/4 — Running inference" +if [[ -n "$PREDICT_TEXT" ]]; then + bash "$SCRIPT_DIR/docker_predict.sh" --text "$PREDICT_TEXT" +else + bash "$SCRIPT_DIR/docker_predict.sh" --text "Apple reports record iPhone sales in Q3" +fi + +echo "" +echo "Pipeline complete!!!!" \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh index d725c3fe7..a6130fec2 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run_jupyter.sh @@ -1,35 +1,11 @@ #!/bin/bash -# """ -# Launch Jupyter Lab server. -# -# This script starts Jupyter Lab on port 8888 with the following configuration: -# - No browser auto-launch (useful for Docker containers) -# - Accessible from any IP address (0.0.0.0) -# - Root user allowed (required for Docker environments) -# - No authentication token or password (for development convenience) -# - Vim keybindings can be enabled via JUPYTER_USE_VIM environment variable -# """ - -# Exit immediately if any command exits with a non-zero status. set -e -# Print each command to stdout before executing it. 
-#set -x - -# Import the utility functions from /git_root. -GIT_ROOT=/git_root -source $GIT_ROOT/class_project/project_template/utils.sh - -# Load Docker configuration variables for this script. -get_docker_vars_script ${BASH_SOURCE[0]} -source $DOCKER_NAME -print_docker_vars - -# Setup Jupyter Lab environment. -setup_jupyter_environment - -# Initialize Jupyter Lab command with base configuration. -JUPYTER_ARGS=$(get_jupyter_args) - -# Start Jupyter Lab with development-friendly settings. -run "jupyter lab $JUPYTER_ARGS" +jupyter lab \ + --no-browser \ + --ip=0.0.0.0 \ + --port=8888 \ + --allow-root \ + --NotebookApp.token='' \ + --NotebookApp.password='' \ + --notebook-dir=/app diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh old mode 100644 new mode 100755 index cc0ed8c4a..ce5eac693 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/utils.sh @@ -1,21 +1,7 @@ #!/bin/bash -# """ -# Utility functions for Docker container management. -# """ - -# ############################################################################# # General utilities -# ############################################################################# - - run() { - # """ - # Execute a command with echo output. - # - # :param cmd: Command string to execute - # :return: Exit status of the executed command - # """ cmd="$*" echo "> $cmd" eval "$cmd" @@ -23,27 +9,14 @@ run() { enable_verbose_mode() { - # """ - # Enable shell command tracing (set -x) when VERBOSE is set to 1. - # - # Reads the VERBOSE variable set by parse_docker_jupyter_args. 
- # Call this after parsing args to activate tracing for the rest of the script. - # """ if [[ $VERBOSE == 1 ]]; then set -x fi } -# ############################################################################# # Argument parsing -# ############################################################################# - - _print_default_help() { - # """ - # Print usage information and available default options for docker scripts. - # """ echo "Usage: $(basename $0) [options]" echo "" echo "Options:" @@ -54,15 +27,6 @@ _print_default_help() { parse_default_args() { - # """ - # Parse default command-line arguments for docker scripts. - # - # Sets VERBOSE and FORCE variables in the caller's scope. Enables set -x - # when -v is passed. Prints help and exits when -h is passed. - # Updates OPTIND so the caller can shift away processed arguments. - # - # :param @: command-line arguments forwarded from the calling script - # """ VERBOSE=0 FORCE=0 while getopts "fhv" flag; do diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh index c46ed254c..3a4117d7e 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/version.sh @@ -1,28 +1,29 @@ #!/bin/bash -# """ -# Display versions of installed tools and packages. -# -# This script prints version information for Python, pip, Jupyter, and all -# installed Python packages. Used for debugging and documentation purposes -# to verify the Docker container environment setup. -# """ +# Report key package versions baked into this image. -# Display Python 3 version. 
-echo "# Python3" -python3 --version +echo "============================================================" +echo " Image version report — $(date -u '+%Y-%m-%d %H:%M UTC')" +echo "============================================================" +echo "Python : $(python --version 2>&1)" +echo "pip : $(pip --version 2>&1)" +echo "------------------------------------------------------------" -# Display pip version. -echo "# pip3" -pip3 --version +packages=( + torch + transformers + datasets + accelerate + evaluate + scikit-learn + pandas + numpy + optuna + jupyterlab +) -# Display Jupyter version. -echo "# jupyter" -jupyter --version +for pkg in "${packages[@]}"; do + version=$(python -c "import importlib.metadata; print(importlib.metadata.version('$pkg'))" 2>/dev/null || echo "not installed") + printf "%-20s %s\n" "$pkg" "$version" +done -# List all installed Python packages and their versions. -echo "# Python packages" -pip3 list - -# Template for adding additional tool versions. -# echo "# mongo" -# mongod --version +echo "============================================================" From ad92f0cff58bed4c85f9f4be8b71630751b0dfd5 Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Wed, 6 May 2026 00:35:46 -0400 Subject: [PATCH 4/8] Model evaluation script and relevant docker file added --- .../README.md | 27 ++- .../docker_evaluate.sh | 19 ++ .../project_files/scripts/evaluate_model.py | 208 ++++++++++++++++++ .../run.sh | 8 +- 4 files changed, 259 insertions(+), 3 deletions(-) create mode 100755 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_evaluate.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/evaluate_model.py diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md 
b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md index 660b6939c..31060c679 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md @@ -150,4 +150,29 @@ Rewritten standalone. Launches Jupyter Lab in a detached container with port-for Rewritten standalone. Gains a `--volumes` flag that additionally removes the `hf_cache` named volume, wiping downloaded model weights for a full clean slate. #### `run_jupyter.sh` -Simplified — removed monorepo framework calls. Retains the same Jupyter flags: `--no-browser`, `--ip=0.0.0.0`, `--allow-root`, no token/password. \ No newline at end of file +Simplified — removed monorepo framework calls. Retains the same Jupyter flags: `--no-browser`, `--ip=0.0.0.0`, `--allow-root`, no token/password. + + +--- + +## Release v3.0 + +### New Files + +#### `scripts/evaluate.py` +Runs full batch evaluation of the fine-tuned model on the AG News test set and saves four outputs to `results/`: +- `classification_report.txt` — per-class precision, recall, F1, and support via sklearn. +- `confusion_matrix.png` — heatmap of true vs predicted labels across all four classes. +- `per_class_metrics.png` — grouped bar chart of precision, recall, and F1 per category. +- `predictions.csv` — row-level predictions with true label, predicted label, and a correct/incorrect flag for manual inspection. +Accepts `--model_dir` to evaluate any saved checkpoint, defaulting to `models/distilbert-ag-news`. Uses `matplotlib.use("Agg")` so plots render correctly inside Docker without a display. + +#### `docker_evaluate.sh` +Runs `scripts/evaluate.py` inside the container non-interactively. All four result files are written to `./results/` on the host via the volume mount. 
Accepts `--model_dir` which is forwarded directly to the script. + +--- + +### Modified Files + +#### `run.sh` +Added evaluate as step 4, shifting predict to step 5. Pipeline order is now: **build → dataloader → train → evaluate → predict**. Step counter in all headers updated from `/4` to `/5`. \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_evaluate.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_evaluate.sh new file mode 100755 index 000000000..a1edde942 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_evaluate.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/docker_utils.sh" +load_docker_vars + +CONTAINER_NAME="${IMAGE_NAME}_evaluate" +OPTS=$(base_run_opts "$CONTAINER_NAME") + +echo "Running evaluation..." +echo " Results will be saved to ./results/ on your host." +echo "" + +run "docker run $OPTS $FULL_IMAGE_NAME python project_files/scripts/evaluate_model.py $*" + +echo "" +echo " Evaluation complete. Check ./results/ for outputs." \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/evaluate_model.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/evaluate_model.py new file mode 100644 index 000000000..6bb0fdc9b --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/evaluate_model.py @@ -0,0 +1,208 @@ +# scripts/evaluate.py +""" +1. Loads the fine-tuned model from models/distilbert-ag-news/best/ +2. 
Runs batch inference over the full test set +3. Computes accuracy, macro F1, per-class precision/recall/F1 +4. Saves the below to results/: + - classification_report.txt + - confusion_matrix.png + - per_class_metrics.png + - predictions.csv +""" + +import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import DataLoader +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, +) +from tqdm import tqdm +import matplotlib +matplotlib.use("Agg") # non-interactive backend — works inside Docker +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.metrics import accuracy_score, f1_score + +from config import OUTPUT_DIR, RESULTS_DIR, LABEL_NAMES, ID2LABEL, BATCH_SIZE +from utils.dataset_loader import load_ag_news, get_subsets +from utils.preprocessing import tokenize_dataset +from utils.metrics import full_report + +os.makedirs(RESULTS_DIR, exist_ok=True) + + +# Model Loading + +def load_model(model_dir: str): + """Load fine-tuned model and tokenizer from disk.""" + best_path = os.path.join(model_dir, "best") + load_path = best_path if os.path.isdir(best_path) else model_dir + if not os.path.isdir(load_path): + print(f"[evaluate] Model not found at '{load_path}'.") + print("[evaluate] Run `python scripts/train.py` first.") + sys.exit(1) + print(f"[evaluate] Loading model from: {load_path}") + tokenizer = AutoTokenizer.from_pretrained(load_path) + model = AutoModelForSequenceClassification.from_pretrained(load_path) + return tokenizer, model + + +# ─── Batch Inference ─────────────────────────────────────────────────────────── + +def run_inference(model, test_ds, tokenizer, device): + """ + Run inference on the entire test split in batches. 
+ + Returns + ------- + y_true : np.ndarray, shape (N,) + y_pred : np.ndarray, shape (N,) + """ + collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt") + loader = DataLoader(test_ds, batch_size=BATCH_SIZE * 2, collate_fn=collator) + + model.eval() + model.to(device) + + all_preds, all_labels = [], [] + + with torch.no_grad(): + for batch in tqdm(loader, desc="Evaluating"): + labels = batch.pop("labels") + batch = {k: v.to(device) for k, v in batch.items()} + logits = model(**batch).logits + preds = torch.argmax(logits, dim=-1).cpu().numpy() + all_preds.extend(preds) + all_labels.extend(labels.numpy()) + + return np.array(all_labels), np.array(all_preds) + + +# ─── Plots ───────────────────────────────────────────────────────────────────── + +def plot_confusion_matrix(cm, label_names, save_path): + """Save a styled confusion matrix heatmap.""" + fig, ax = plt.subplots(figsize=(7, 6)) + sns.heatmap( + cm, annot=True, fmt="d", cmap="Blues", + xticklabels=label_names, yticklabels=label_names, + linewidths=0.5, ax=ax, + ) + ax.set_xlabel("Predicted Label", fontsize=12, labelpad=10) + ax.set_ylabel("True Label", fontsize=12, labelpad=10) + ax.set_title("Confusion Matrix — AG News Test Set", fontsize=14, pad=15) + plt.tight_layout() + fig.savefig(save_path, dpi=150) + plt.close(fig) + print(f"[evaluate] Confusion matrix saved : {save_path}") + + +def plot_per_class_metrics(report_str, label_names, save_path): + """Parse sklearn report string and plot per-class bars.""" + lines = report_str.strip().split("\n") + rows = [] + for line in lines[2: 2 + len(label_names)]: + parts = line.split() + if len(parts) >= 5: + rows.append({ + "class": parts[0], + "Precision": float(parts[1]), + "Recall": float(parts[2]), + "F1-Score": float(parts[3]), + }) + + if not rows: + print("[evaluate] Could not parse report for per-class chart.") + return + + df = pd.DataFrame(rows).set_index("class") + ax = df.plot(kind="bar", figsize=(9, 5), colormap="Set2", + width=0.7, 
edgecolor="white") + ax.set_title("Per-Class Metrics — AG News", fontsize=14) + ax.set_xlabel("News Category", fontsize=11) + ax.set_ylabel("Score", fontsize=11) + ax.set_ylim(0, 1.05) + ax.legend(loc="lower right") + ax.grid(axis="y", alpha=0.4) + plt.xticks(rotation=0) + plt.tight_layout() + ax.figure.savefig(save_path, dpi=150) + plt.close(ax.figure) + print(f"[evaluate] Per-class chart saved : {save_path}") + + +# ─── Main ────────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_dir", type=str, default=OUTPUT_DIR) + args = parser.parse_args() + + device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"[evaluate] Device: {device}") + + # Load model. + tokenizer, model = load_model(args.model_dir) + + # Load and tokenize test data. + dataset = load_ag_news() + dataset = get_subsets(dataset) + tokenized = tokenize_dataset(dataset, tokenizer) + test_ds = tokenized["test"] + + # Run inference. + print(f"\n[evaluate] Running inference on {len(test_ds):,} test examples...") + y_true, y_pred = run_inference(model, test_ds, tokenizer, device) + + # ── 1. Classification report ─────────────────────────────────────────────── + report, cm = full_report(y_true, y_pred, LABEL_NAMES) + print("\n── Classification Report ─────────────────────────────────────────") + print(report) + + report_path = os.path.join(RESULTS_DIR, "classification_report.txt") + with open(report_path, "w") as f: + f.write(f"Model : {args.model_dir}\n\n") + f.write(report) + print(f"[evaluate] Report saved : {report_path}") + + # ── 2. Confusion matrix ──────────────────────────────────────────────────── + cm_path = os.path.join(RESULTS_DIR, "confusion_matrix.png") + plot_confusion_matrix(cm, LABEL_NAMES, cm_path) + + # ── 3. 
Per-class metrics bar chart ───────────────────────────────────────── + pc_path = os.path.join(RESULTS_DIR, "per_class_metrics.png") + plot_per_class_metrics(report, LABEL_NAMES, pc_path) + + # ── 4. Raw predictions CSV ───────────────────────────────────────────────── + pred_df = pd.DataFrame({ + "true_label_id": y_true, + "pred_label_id": y_pred, + "true_label": [ID2LABEL[i] for i in y_true], + "pred_label": [ID2LABEL[i] for i in y_pred], + "correct": y_true == y_pred, + }) + csv_path = os.path.join(RESULTS_DIR, "predictions.csv") + pred_df.to_csv(csv_path, index=False) + print(f"[evaluate] Predictions saved : {csv_path}") + + # ── 5. Summary ──────────────────────────────────────────────────────────── + acc = accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average="macro") + print(f"\n{'='*50}") + print(f" Final Results") + print(f" Accuracy : {acc:.4f} ({acc*100:.2f}%)") + print(f" F1 Macro : {f1:.4f}") + print(f"{'='*50}\n") + + +if __name__ == "__main__": + main() diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh index 4dd2b5652..5c9433403 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/run.sh @@ -52,16 +52,20 @@ else fi # Step 2: Train -step "Step 3/4 — Training model" +step "Step 2/4 — Training model" bash "$SCRIPT_DIR/docker_train.sh" $TRAIN_ARGS # Step 3: Predict -step "Step 4/4 — Running inference" +step "Step 3/4 — Running inference" if [[ -n "$PREDICT_TEXT" ]]; then bash "$SCRIPT_DIR/docker_predict.sh" --text "$PREDICT_TEXT" else bash "$SCRIPT_DIR/docker_predict.sh" --text "Apple reports record iPhone sales in Q3" fi +# Step 4: Evaluate +step "Step 
4/4 — Evaluating model" +bash "$SCRIPT_DIR/docker_evaluate.sh" + echo "" echo "Pipeline complete!!!!" \ No newline at end of file From 75f69a91f33e6d1649525c99a9f351e97294c7f3 Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Wed, 6 May 2026 01:00:45 -0400 Subject: [PATCH 5/8] Fixed train and predict scripts --- .../project_files/scripts/predict.py | 22 ++------- .../project_files/scripts/train.py | 49 ++++--------------- 2 files changed, 14 insertions(+), 57 deletions(-) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py index 7777ea7d8..32316ffaf 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py @@ -1,14 +1,6 @@ # scripts/predict.py """ -COMMIT 2 — Inference -====================== -Run the fine-tuned model on custom text input. - -Usage ------ - python scripts/predict.py --text "Apple reports record iPhone sales" - python scripts/predict.py --file articles.txt # one article per line - python scripts/predict.py # interactive mode +This script is to run the fine-tuned model on custom text input. """ import argparse @@ -26,14 +18,13 @@ def load_model(model_dir: str): """ - Load a fine-tuned model and tokenizer from disk. - Falls back to the HuggingFace hub if local path not found. 
+ Load a fine-tuned model and tokenizer(best model that we got from train.py) """ best_path = os.path.join(model_dir, "best") load_path = best_path if os.path.isdir(best_path) else model_dir if not os.path.isdir(load_path): - print(f"[predict] ⚠️ Model not found at '{load_path}'.") + print(f"[predict] Model not found at '{load_path}'.") print("[predict] Please run `python scripts/train.py` first.") sys.exit(1) @@ -47,11 +38,6 @@ def load_model(model_dir: str): def predict(texts, tokenizer, model, device="cpu"): """ Predict category labels for a list of texts. - - Returns - ------- - results : list of dict - Each dict has 'text', 'label', 'confidence', 'all_scores' """ cleaned = [clean_text(t) for t in texts] inputs = tokenizer( @@ -111,7 +97,7 @@ def main(): texts = [] while True: try: - t = input("\n📰 Article: ").strip() + t = input("\n Article: ").strip() if t: results = predict([t], tokenizer, model, device) display_results(results) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py index 44ffa555d..edcb22a29 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py @@ -1,31 +1,11 @@ -# scripts/train.py """ -COMMIT 2 — Model Selection & Fine-Tuning -========================================== What this script does ---------------------- -1. Loads AG News dataset and applies the preprocessing pipeline from Commit 1. + +1. Loads AG News dataset and applies the preprocessing pipeline. 2. 
Instantiates a pre-trained DistilBERT model with a classification head (AutoModelForSequenceClassification). 3. Configures HuggingFace Trainer with TrainingArguments. -4. Fine-tunes the model for `EPOCHS` epochs. -5. Saves the best checkpoint to OUTPUT_DIR. - -Why DistilBERT as default? - - 40% smaller and 60% faster than BERT-base with ~97% of its accuracy. - - Great baseline for a 4-class news classification task. - - Can swap in BERT or RoBERTa via config.py for Commit 4 comparisons. - -Fine-tuning strategy - - All transformer layers are trainable (full fine-tune, not frozen). - - Linear learning rate warmup for 500 steps avoids large early gradient updates. - - WeightDecay regularization prevents overfitting on a relatively small task. - - evaluation_strategy="epoch" saves checkpoints every epoch and picks the best. - -Usage ------ - python scripts/train.py # default DistilBERT - python scripts/train.py --model bert-base-uncased +4. Saves the best checkpoint(model) """ import argparse @@ -80,12 +60,6 @@ def parse_args(): def build_model(model_name: str): """ Load a pre-trained transformer model with a sequence classification head. - - AutoModelForSequenceClassification adds: - - A dropout layer - - A linear projection: hidden_size → num_labels - on top of the transformer backbone. Only the classification head is randomly - initialized; the transformer weights come from the pre-trained checkpoint. """ print(f"\n[train] Loading model: {model_name}") model = AutoModelForSequenceClassification.from_pretrained( @@ -105,12 +79,9 @@ def build_training_args(output_dir: str, epochs: int, batch_size: int, lr: float """ Configure HuggingFace TrainingArguments. 
- Key decisions - ------------- - evaluation_strategy = "epoch" → evaluate on val set after every epoch - load_best_model_at_end = True → restore best checkpoint after training - - metric_for_best_model = "f1_macro" → choose checkpoints by macro-F1 - - fp16 = auto-detected based on CUDA availability → faster on GPU + - metric_for_best_model = "f1_macro" """ use_fp16 = torch.cuda.is_available() return TrainingArguments( @@ -130,7 +101,7 @@ def build_training_args(output_dir: str, epochs: int, batch_size: int, lr: float logging_steps=100, seed=SEED, fp16=use_fp16, - report_to="none", # disable W&B / MLflow unless you want them + report_to="none", ) @@ -140,7 +111,7 @@ def main(): out_dir = args.output_dir or OUTPUT_DIR os.makedirs(out_dir, exist_ok=True) - # ── Step 1: Load & preprocess data ───────────────────────────────────────── + #Load & preprocess data dataset = load_ag_news() dataset = get_subsets(dataset) tokenizer = get_tokenizer(model_name) @@ -149,17 +120,17 @@ def main(): train_ds = tokenized["train"] eval_ds = tokenized["validation"] - # ── Step 2: Build model ───────────────────────────────────────────────────── + #Build model model = build_model(model_name) - # ── Step 3: Configure training ────────────────────────────────────────────── + #Configure training training_args = build_training_args(out_dir, args.epochs, args.batch_size, args.lr) # DataCollatorWithPadding pads each batch to its longest sequence # (more efficient than global max_length padding) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - # ── Step 4: Train ─────────────────────────────────────────────────────────── + #Train trainer = Trainer( model=model, args=training_args, @@ -177,7 +148,7 @@ def main(): best_path = os.path.join(out_dir, "best") trainer.save_model(best_path) tokenizer.save_pretrained(best_path) - print(f"\n[train] ✅ Best model saved to: {best_path}") + print(f"\n[train] Best model saved to: {best_path}") # Save training metrics summary metrics_path = 
os.path.join(out_dir, "train_results.txt") From a7adf8b0a4e27672b6213a6d11a86dee8d49cffb Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Wed, 6 May 2026 01:09:12 -0400 Subject: [PATCH 6/8] Updated README doc format --- .../README.md | 518 +++++++++++++----- 1 file changed, 379 insertions(+), 139 deletions(-) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md index 31060c679..d066cc25b 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md @@ -1,178 +1,418 @@ -# HuggingFace Text Classification Model +# HuggingFace News Article Classification ## Description -HuggingFace is an open-source platform that provides ready-to-use, state-of-the-art language models along with the tools needed to fine-tune, evaluate, and deploy them for any natural language task, without needing to build models from scratch. +This project builds an end-to-end News Article Classification Pipeline using HuggingFace Transformers. Given a raw news article, the system fine-tunes transformer models (DistilBERT, BERT, RoBERTa) on the AG News dataset for 4-class topic classification and serves predictions through a command-line inference interface. -This project builds a News Article Classification Pipeline on top of HuggingFace. Given a raw news article, the system ingests data from public news datasets (AG News, BBC News), fine-tunes transformer models, covering BERT, DistilBERT, and RoBERTa for multi-class topic classification, and serves predictions through a live inference endpoint and dashboard. 
- -The full stack uses HuggingFace Transformers and Datasets for tokenization and fine-tuning, PyTorch as the training backend, Scikit-learn for evaluation metrics, FastAPI for inference serving, and Streamlit for the prediction dashboard. - -## Project Specs: -https://github.com/gpsaggese/gpsaggese.github.io/blob/master/class_project/data605/Spring2026/projects_descriptions/HuggingFace_Project_Description.md +The entire pipeline — data loading, preprocessing, training, evaluation, and inference — runs inside Docker, requiring no local Python environment setup beyond Docker Desktop. **Authors**: @riyaapuri @stupatel17 **Assigned to**: @riyaapuri @stupatel17 @protocorn @gpsaggese +**Project Specs**: https://github.com/gpsaggese/gpsaggese.github.io/blob/master/class_project/data605/Spring2026/projects_descriptions/HuggingFace_Project_Description.md + +--- + +## Table of Contents + +- [Architecture](#architecture) +- [Stack](#stack) +- [Project Structure](#project-structure) +- [Prerequisites](#prerequisites) +- [Installation](#installation) +- [Usage](#usage) +- [Configuration](#configuration) +- [Pipeline Steps](#pipeline-steps) +- [Outputs](#outputs) +- [Release Notes](#release-notes) + +--- + +## Architecture + +``` +Raw News Article + │ + ▼ +┌─────────────────┐ +│ dataset_loader │ Loads AG News from HuggingFace Hub +│ │ Splits into train / validation / test +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ preprocessing │ Cleans text (HTML, URLs, whitespace) +│ │ Tokenizes with AutoTokenizer (max 128 tokens) +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ train.py │ Fine-tunes DistilBERT / BERT / RoBERTa +│ │ Saves best checkpoint by macro-F1 +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ evaluate.py │ Batch inference on test set +│ │ Outputs report, confusion matrix, CSV +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ predict.py │ Single article / file / interactive inference +│ │ Returns label + per-class confidence scores 
+└─────────────────┘ +``` + +**Docker layer**: All steps above run inside a single container. The project directory is volume-mounted at `/app` so code edits are reflected immediately without rebuilding. HuggingFace model downloads are persisted in a named Docker volume (`hf_cache`) so models are not re-downloaded across runs. + +--- + ## Stack | Layer | Library | |---|---| | Modeling & Tokenization | HuggingFace Transformers, Datasets | | Training Backend | PyTorch, Accelerate | -| Evaluation | Scikit-learn, HuggingFace Evaluate | +| Evaluation | Scikit-learn | | Hyperparameter Tuning | Optuna | -| Serving | FastAPI | -| Dashboard | Streamlit | +| Serving | FastAPI *(upcoming)* | +| Dashboard | Streamlit *(upcoming)* | +| Containerization | Docker | + +--- + +## Project Structure + +``` +project_root/ +│ +├── project_files/ +│ ├── config.py # All constants and hyperparameters +│ ├── requirements.txt # Python dependencies +│ │ +│ ├── scripts/ +│ │ ├── train.py # Fine-tuning script +│ │ ├── evaluate.py # Evaluation + result export +│ │ └── predict.py # Inference script +│ │ +│ └── utils/ +│ ├── dataset_loader.py # Data loading and inspection +│ ├── preprocessing.py # Text cleaning and tokenization +│ └── metrics.py # Metric callbacks and report utilities +│ +├── models/ # Saved model checkpoints (generated) +│ └── distilbert-ag-news/ +│ └── best/ # Best checkpoint by macro-F1 +│ +├── results/ # Evaluation outputs (generated) +│ ├── classification_report.txt +│ ├── confusion_matrix.png +│ ├── per_class_metrics.png +│ └── predictions.csv +│ +├── Dockerfile # ML-ready container image +├── docker_name.sh # Image name configuration +├── docker_utils.sh # Shared helper functions +├── run.sh # Unified pipeline wrapper +├── docker_build.sh # Build the Docker image +├── docker_dataloader.sh # Run dataset_loader.py +├── docker_train.sh # Run train.py +├── docker_evaluate.sh # Run evaluate.py +├── docker_predict.sh # Run predict.py +├── docker_bash.sh # Open interactive shell 
+├── docker_jupyter.sh # Launch Jupyter Lab +├── docker_clean.sh # Remove image and cache +├── run_jupyter.sh # Jupyter startup (runs inside container) +├── version.sh # Package version logger (runs at build) +├── bashrc # Shell config copied into image +└── etc_sudoers # Sudo config copied into image +``` + +--- + +## Prerequisites + +**Docker Desktop** is the only requirement. No local Python installation is needed. + +| OS | Instructions | +|---|---| +| macOS | Download from https://www.docker.com/products/docker-desktop and install | +| Windows | Install Docker Desktop; WSL2 will be enabled automatically | +| Linux | Install Docker Engine via your package manager (`apt`, `dnf`, etc.) | + +Verify Docker is working before proceeding: + +```bash +docker --version +docker run hello-world +``` --- -## Release v1.0 +## Installation -### `config.py` -Central config for all constants — dataset name, label mappings, model checkpoints, training hyperparameters, and paths. Imported by every other module to avoid hardcoded values. +```bash +# 1. Clone the repository +git clone <repository-url> +cd <project-directory> -### `dataset_loader.py` -Loads AG News from the HuggingFace hub. Optionally subsets train/test splits for faster iteration. Includes `summarize_dataset()` for label-distribution stats and `get_sample_articles()` for spot-checking raw examples. +# 2. Build the Docker image (one-time, ~5 minutes) +./docker_build.sh +``` -### `preprocessing.py` -Three-stage pipeline: **clean → tokenize → format**. -- `clean_text()` strips HTML entities, URLs, and excess whitespace. Punctuation and casing are intentionally preserved for the tokenizer. -- `get_tokenizer()` loads an `AutoTokenizer` for any HF checkpoint. -- `make_tokenize_fn()` returns a closure for use with `dataset.map()`, applying cleaning + tokenization with `padding="max_length"` and `truncation=True` at `MAX_LENGTH=128`. 
-- `tokenize_dataset()` runs batched tokenization over the full `DatasetDict` and sets torch format on the output columns (`input_ids`, `attention_mask`, `label`). +The build installs all Python dependencies from `project_files/requirements.txt` into the image. You only need to rebuild if `requirements.txt` or the `Dockerfile` changes. -### `metrics.py` -Two evaluation utilities: -- `compute_metrics()` — Trainer callback returning `accuracy` and `f1_macro` after each eval step. -- `full_report()` — generates a detailed sklearn classification report and confusion matrix, used during final model evaluation. +To force a clean rebuild (re-downloads all packages): -### `requirements.txt` -Pins all dependencies. Key versions: `transformers>=4.35`, `torch>=2.0`, `datasets>=2.14`, `scikit-learn>=1.3`, `optuna>=3.3`. +```bash +./docker_build.sh --no-cache +``` --- - -## Release v2.0 -### How to run (this will be cleaned up before the final commit) +## Usage + +### Full Pipeline -**Prerequisites:** Docker Desktop installed and running. No other local dependencies needed from this commit onwards. 
+Run the entire pipeline — data loading, training, evaluation, and prediction — in a single command: ```bash -# Full pipeline — build, load data, train, predict ./run.sh - -# With a custom model and prediction text -./run.sh --model roberta-base --epochs 5 --text "Fed raises interest rates again" - -# Skip rebuild if image already exists +``` + +With options: + +```bash +# Swap the model backbone +./run.sh --model bert-base-uncased + +# Override training hyperparameters +./run.sh --model roberta-base --epochs 5 --batch_size 32 --lr 3e-5 + +# Set a custom prediction article +./run.sh --text "Federal Reserve raises interest rates for the third time this year" + +# Skip the Docker build step if the image already exists ./run.sh --skip-build --text "Apple reports record iPhone sales" - -# Run individual steps -./docker_build.sh # build image -./docker_dataloader.sh # inspect dataset -./docker_train.sh --model bert-base-uncased --epochs 3 # fine-tune -./docker_predict.sh --text "NASA launches new satellite" # single prediction +``` + +### Individual Steps + +Each pipeline step can also be run independently: + +```bash +# Inspect the dataset (label distribution, sample articles) +./docker_dataloader.sh + +# Fine-tune the model +./docker_train.sh +./docker_train.sh --model bert-base-uncased --epochs 3 + +# Evaluate the trained model +./docker_evaluate.sh +./docker_evaluate.sh --model_dir models/bert-ag-news + +# Run inference +./docker_predict.sh --text "NASA launches a new satellite into orbit" +./docker_predict.sh --file /app/project_files/articles.txt ./docker_predict.sh # interactive mode -./docker_jupyter.sh # open Jupyter Lab + +# Open a shell inside the container for debugging +./docker_bash.sh + +# Launch Jupyter Lab at http://localhost:8888/lab +./docker_jupyter.sh +JUPYTER_PORT=8889 ./docker_jupyter.sh # custom port + +# Remove the Docker image +./docker_clean.sh +./docker_clean.sh --volumes # also clears HF model cache ``` - + --- - -### New Files - -#### 
`scripts/train.py` -Fine-tunes a transformer model on AG News end-to-end. Orchestrates the full training pipeline in five steps: -1. Loads and preprocesses the dataset via the `dataset_loader` and `preprocessing` utilities. -2. Instantiates `AutoModelForSequenceClassification` with a classification head (dropout + linear projection to 4 labels) on top of the pre-trained transformer backbone. -3. Configures HuggingFace `Trainer` with: linear LR warmup over 500 steps, weight decay regularization, per-epoch evaluation, and macro-F1 as the checkpoint selection metric. -4. Runs training with optional `fp16` on CUDA for speed. -5. Saves the best checkpoint to `models//best/` and writes a training log to `train_results.txt`. -Accepts CLI flags `--model`, `--epochs`, `--batch_size`, `--lr` so any backbone (DistilBERT, BERT, RoBERTa) can be swapped without touching code. - -#### `scripts/predict.py` -Loads a fine-tuned checkpoint and runs inference in three modes: -- `--text` — classify a single article passed as a string. -- `--file` — classify all articles in a text file (one per line). -- Interactive — prompts for articles in a loop until `Ctrl+C`. -Outputs the predicted label, confidence percentage, and a score bar for all four classes. Falls back gracefully with a clear error if the model checkpoint is not found. - -#### `run.sh` -Unified pipeline wrapper that runs the entire workflow in a single command. Executes four steps in order: build image → load dataset → train model → run prediction. Accepts `--skip-build` to avoid rebuilding when the image already exists, `--text` to set a custom prediction article, and all `train.py` flags (`--model`, `--epochs`, `--batch_size`, `--lr`) which are forwarded directly to the training step. - -#### `docker_utils.sh` -Shared helper library sourced by every `docker_*.sh` script. 
Provides three functions: `run()` to echo and execute commands, `load_docker_vars()` to source `docker_name.sh` and print resolved image names, and `base_run_opts()` to build the standard `docker run` flags including the code volume mount and HuggingFace cache volume. - -#### `docker_train.sh` -Runs `scripts/train.py` inside the container. Forwards all CLI arguments directly to the script, so any combination of `--model`, `--epochs`, `--batch_size`, and `--lr` works without modifying the script. The trained model is saved to `./models/` on the host via the volume mount. - -#### `docker_predict.sh` -Runs `scripts/predict.py` inside the container. Allocates a TTY only when called with no arguments (interactive mode), allowing it to also be called non-interactively from `run.sh` without a "not a TTY" error. - -#### `docker_dataloader.sh` -Runs `utils/dataset_loader.py` inside the container. Prints dataset size, per-label distribution, and sample articles so the data can be inspected before committing to a full training run. - -#### `version.sh` -Executed inside the container at image build time. Prints and logs the versions of all key packages (torch, transformers, datasets, etc.) to `/install/version.log`, making the image reproducible and debuggable. - + +## Configuration + +All tunable parameters live in `project_files/config.py`. Edit this file to change any default without modifying the scripts. 
+ +| Parameter | Default | Description | +|---|---|---| +| `DEFAULT_MODEL` | `distilbert-base-uncased` | Backbone used when no `--model` flag is passed | +| `BERT_MODEL` | `bert-base-uncased` | BERT checkpoint name for reference | +| `ROBERTA_MODEL` | `roberta-base` | RoBERTa checkpoint name for reference | +| `EPOCHS` | `3` | Number of training epochs | +| `BATCH_SIZE` | `16` | Per-device training batch size | +| `LEARNING_RATE` | `2e-5` | Peak learning rate | +| `WEIGHT_DECAY` | `0.01` | L2 regularization strength | +| `WARMUP_STEPS` | `500` | Linear LR warmup steps | +| `MAX_LENGTH` | `128` | Max tokens per article | +| `TRAIN_SUBSET` | `None` | Set to an integer to use a smaller training slice | +| `EVAL_SUBSET` | `None` | Set to an integer to use a smaller test slice | +| `OUTPUT_DIR` | `models/distilbert-ag-news` | Where the trained checkpoint is saved | +| `RESULTS_DIR` | `results` | Where evaluation outputs are saved | +| `SEED` | `42` | Random seed for reproducibility | + --- - -### Modified Files - -#### `config.py` -Added `BERT_MODEL` and `ROBERTA_MODEL` checkpoint constants alongside the existing `DEFAULT_MODEL` (DistilBERT) so alternative backbones can be referenced by name across the codebase without hardcoding strings. `RESULTS_DIR` path added for evaluation outputs in future commits. - -#### `utils/dataset_loader.py` -Added a validation split to test and find the best fit model before saving it. Since AGNews does not have a default validation split to ensure the final evaluation is done on unseen test data. This split (90/10 train, validation) helps us achieve that. - -#### `utils/metrics.py` -Updated import paths to reflect the new `project_files/` directory structure. No logic changes — `compute_metrics()` and `full_report()` behave identically to v1.0. 
- -#### `utils/preprocessing.py` -Fixed the `sys.path` insert to correctly locate `config.py` when the script is called from inside the `utils/` subdirectory as part of the restructured project layout. - -#### `Dockerfile` -Updated from the bare template to a full ML image: -- Base stays `python:3.12-slim`; added system packages `git`, `build-essential`, `g++`, `libgomp1` -- `COPY` path for `requirements.txt` changed to `project_files/requirements.txt`. -- Added `ENV HF_HOME=/hf_cache` and `ENV TRANSFORMERS_CACHE=/hf_cache` so all HuggingFace model downloads survive container restarts. -- Project code is not copied into the image — it is volume-mounted at runtime, so code edits require no rebuild. -#### `docker_build.sh` -Rewritten to be fully standalone — removed the dependency on the monorepo `utils.sh` via `git rev-parse`. Sources `docker_utils.sh` instead. Enables `DOCKER_BUILDKIT=1` for faster cached layer builds. Extra args (e.g. `--no-cache`) are passed through to `docker build`. - -#### `docker_bash.sh` -Rewritten standalone. Opens an interactive bash shell inside the container with the project directory live-mounted at `/app`. Used for manual debugging and exploration. - -#### `docker_jupyter.sh` -Rewritten standalone. Launches Jupyter Lab in a detached container with port-forwarding (`host:8888 → container:8888`). Port can be overridden via `JUPYTER_PORT` env var. - -#### `docker_clean.sh` -Rewritten standalone. Gains a `--volumes` flag that additionally removes the `hf_cache` named volume, wiping downloaded model weights for a full clean slate. - -#### `run_jupyter.sh` -Simplified — removed monorepo framework calls. Retains the same Jupyter flags: `--no-browser`, `--ip=0.0.0.0`, `--allow-root`, no token/password. +## Pipeline Steps + +### Step 1 — Dataset Loading (`utils/dataset_loader.py`) + +Loads the [AG News](https://huggingface.co/datasets/ag_news) dataset from the HuggingFace Hub. 
AG News contains 120,000 training and 7,600 test articles across four categories: + +| ID | Category | +|---|---| +| 0 | World | +| 1 | Sports | +| 2 | Business | +| 3 | Sci/Tech | + +Key functions: + +- `load_ag_news()` — downloads the dataset from the HuggingFace Hub. +- `get_subsets()` — optionally samples a smaller slice for faster iteration, controlled by `TRAIN_SUBSET` and `EVAL_SUBSET` in `config.py`. +- `summarize_dataset()` — prints per-label counts to the terminal. +- `get_sample_articles()` — prints random raw examples for spot-checking before training. + +A validation split (90/10 from train) is created at load time since AG News does not provide one by default. This ensures the test set remains fully unseen during model selection. + +--- + +### Step 2 — Preprocessing (`utils/preprocessing.py`) + +Three-stage pipeline applied before training: + +**Clean** (`clean_text()`): Removes HTML entities (`&amp;`, `&quot;`), URLs, and collapses excess whitespace. Punctuation and casing are preserved for the tokenizer. + +**Tokenize** (`make_tokenize_fn()`): Returns a closure for use with `dataset.map()`. Applies `padding="max_length"` and `truncation=True` at `MAX_LENGTH=128` tokens using `AutoTokenizer`. + +**Format** (`tokenize_dataset()`): Runs batched tokenization across the full `DatasetDict` and sets PyTorch tensor format on `input_ids`, `attention_mask`, and `label`. --- -## Release v3.0 - -### New Files - -#### `scripts/evaluate.py` -Runs full batch evaluation of the fine-tuned model on the AG News test set and saves four outputs to `results/`: -- `classification_report.txt` — per-class precision, recall, F1, and support via sklearn. -- `confusion_matrix.png` — heatmap of true vs predicted labels across all four classes. -- `per_class_metrics.png` — grouped bar chart of precision, recall, and F1 per category. -- `predictions.csv` — row-level predictions with true label, predicted label, and a correct/incorrect flag for manual inspection. 
-Accepts `--model_dir` to evaluate any saved checkpoint, defaulting to `models/distilbert-ag-news`. Uses `matplotlib.use("Agg")` so plots render correctly inside Docker without a display. - -#### `docker_evaluate.sh` -Runs `scripts/evaluate.py` inside the container non-interactively. All four result files are written to `./results/` on the host via the volume mount. Accepts `--model_dir` which is forwarded directly to the script. - ---- - -### Modified Files - -#### `run.sh` -Added evaluate as step 4, shifting predict to step 5. Pipeline order is now: **build → dataloader → train → evaluate → predict**. Step counter in all headers updated from `/4` to `/5`. \ No newline at end of file +### Step 3 — Training (`scripts/train.py`) + +Fine-tunes a transformer model on the preprocessed dataset using HuggingFace `Trainer`. + +**Model**: `AutoModelForSequenceClassification` adds a dropout layer and a linear projection (`hidden_size → 4 labels`) on top of the pre-trained backbone. Only the classification head is randomly initialized; transformer weights come from the HuggingFace checkpoint. + +**Training decisions**: +- All transformer layers are trainable (full fine-tune, not frozen). +- Linear learning rate warmup over 500 steps avoids large early gradient updates. +- Weight decay regularization prevents overfitting. +- Evaluation runs after every epoch; the best checkpoint is selected by macro-F1. +- `fp16` mixed-precision is enabled automatically when CUDA is available. + +**CLI flags**: + +```bash +python scripts/train.py --model bert-base-uncased --epochs 5 --batch_size 32 --lr 3e-5 +``` + +**Outputs** (written to `models/<model-name>/`): +- `best/` — best checkpoint (model weights + tokenizer). +- `train_results.txt` — per-epoch training log. 
+ +--- + +### Step 4 — Evaluation (`scripts/evaluate.py`) + +Runs batch inference on the full test set and saves four outputs to `results/`: + +| Output file | Contents | +|---|---| +| `classification_report.txt` | Per-class precision, recall, F1, and support | +| `confusion_matrix.png` | Heatmap of true vs predicted labels | +| `per_class_metrics.png` | Grouped bar chart of precision, recall, and F1 per category | +| `predictions.csv` | Row-level predictions with true label, predicted label, and a correct/incorrect flag | + +Uses `matplotlib.use("Agg")` so plots render inside Docker without a display. + +**CLI flags**: + +```bash +python scripts/evaluate.py --model_dir models/bert-ag-news +``` + +--- + +### Step 5 — Inference (`scripts/predict.py`) + +Loads a saved checkpoint and classifies articles in three modes: + +**Single article**: +```bash +./docker_predict.sh --text "Apple reports record iPhone sales in Q3" +``` + +**File** (one article per line): +```bash +./docker_predict.sh --file /app/project_files/articles.txt +``` + +**Interactive** (type articles one at a time, `Ctrl+C` to quit): +```bash +./docker_predict.sh +``` + +Example output: +``` +── Result 1 ───────────────────────────────────────────── + Text : Apple reports record iPhone sales in Q3... + Prediction : Business (94.21% confidence) + All scores : + Business 94.21% ██████████████████ + Sci/Tech 3.87% + World 1.12% + Sports 0.80% +``` + +--- + +## Outputs + +All outputs are written to the host machine via the Docker volume mount and persist after the container exits. 
+ +| Path | Contents | Generated by | +|---|---|---| +| `models/<model-name>/best/` | Fine-tuned model weights and tokenizer | `train.py` | +| `models/<model-name>/train_results.txt` | Per-epoch training log | `train.py` | +| `results/classification_report.txt` | Full sklearn classification report | `evaluate.py` | +| `results/confusion_matrix.png` | Confusion matrix heatmap | `evaluate.py` | +| `results/per_class_metrics.png` | Per-class metric bar chart | `evaluate.py` | +| `results/predictions.csv` | Row-level test set predictions | `evaluate.py` | + +--- + +## Release Notes + +### Release v1.0 +Initial data and preprocessing pipeline. + +- `config.py` — central configuration for all constants and hyperparameters. +- `utils/dataset_loader.py` — AG News loading, subsetting, and dataset inspection. +- `utils/preprocessing.py` — text cleaning, tokenization, and dataset formatting. +- `utils/metrics.py` — Trainer callback (`compute_metrics`) and sklearn report utility (`full_report`). +- `requirements.txt` — full dependency list. + +### Release v2.0 +Model training, inference, and Docker integration. + +- **New**: `scripts/train.py` — end-to-end fine-tuning with HuggingFace Trainer. +- **New**: `scripts/predict.py` — three-mode inference (single article, file, interactive). +- **New**: `run.sh` — unified pipeline wrapper with CLI flags forwarded to each step. +- **New**: `Dockerfile`, `docker_utils.sh`, `docker_name.sh`, `docker_build.sh`, `docker_train.sh`, `docker_predict.sh`, `docker_dataloader.sh`, `docker_bash.sh`, `docker_jupyter.sh`, `docker_clean.sh`, `run_jupyter.sh`, `version.sh` — full standalone Docker integration. +- **Modified**: `config.py` — added `BERT_MODEL`, `ROBERTA_MODEL`, and `RESULTS_DIR`. +- **Modified**: `utils/dataset_loader.py` — added 90/10 validation split from the training set since AG News has no default validation split. +- **Modified**: `utils/preprocessing.py` — fixed `sys.path` insert for subdirectory execution. 
+- **Modified**: `requirements.txt` — moved from project root into `project_files/`; root-level duplicate removed. +- **Modified**: `Dockerfile` — upgraded from bare template to full ML image; added system packages required by PyTorch and scikit-learn; configured HuggingFace cache volume. + +### Release v3.0 +Model evaluation and result export. + +- **New**: `scripts/evaluate.py` — batch inference on the test set, exports classification report, confusion matrix, per-class metric chart, and predictions CSV. +- **New**: `docker_evaluate.sh` — runs `evaluate.py` inside the container; results written to `./results/` on the host. +- **Modified**: `run.sh` — evaluation added as step 4; prediction moved to step 5; step counters updated. +- **Modified**: `utils/metrics.py` — removed `import evaluate` (HuggingFace library) to resolve a circular import. When running `scripts/evaluate.py`, Python adds `scripts/` to `sys.path` automatically, causing `import evaluate` inside `metrics.py` to resolve to `scripts/evaluate.py` instead of the HuggingFace package. Fixed by replacing the single usage with `accuracy_score` from sklearn, which produces identical results with no behaviour change. 
\ No newline at end of file From 6268f4708c846beba8c4c5ea205730deec5319d7 Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Thu, 7 May 2026 22:49:28 -0400 Subject: [PATCH 7/8] File cleanup --- .../README.md | 28 +++++++++---------- .../docker_build.sh | 4 +-- .../docker_clean.sh | 2 +- .../docker_jupyter.sh | 4 +-- .../docker_predict.sh | 9 ++---- .../docker_train.sh | 4 +-- .../project_files/scripts/predict.py | 12 ++------ .../project_files/scripts/train.py | 16 ++++------- .../project_files/utils/preprocessing.py | 13 ++------- 9 files changed, 34 insertions(+), 58 deletions(-) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md index d066cc25b..dcf8bedb9 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/README.md @@ -2,12 +2,11 @@ ## Description -This project builds an end-to-end News Article Classification Pipeline using HuggingFace Transformers. Given a raw news article, the system fine-tunes transformer models (DistilBERT, BERT, RoBERTa) on the AG News dataset for 4-class topic classification and serves predictions through a command-line inference interface. +In this project, an end-to-end pipeline will be implemented for News Article Classification with Huggingface Transformers. The system trains a DistilBERT transformer using the AG News dataset for classifying articles into four classes using transfer learning. -The entire pipeline — data loading, preprocessing, training, evaluation, and inference — runs inside Docker, requiring no local Python environment setup beyond Docker Desktop. 
+The entire pipeline — data loading, preprocessing, training, evaluation, and inference — runs inside Docker, requiring no local Python environment setup beyond Docker Desktop. **Authors**: @riyaapuri @stupatel17 -**Assigned to**: @riyaapuri @stupatel17 @protocorn @gpsaggese **Project Specs**: https://github.com/gpsaggese/gpsaggese.github.io/blob/master/class_project/data605/Spring2026/projects_descriptions/HuggingFace_Project_Description.md @@ -47,8 +46,8 @@ Raw News Article │ ▼ ┌─────────────────┐ -│ train.py │ Fine-tunes DistilBERT / BERT / RoBERTa -│ │ Saves best checkpoint by macro-F1 +│ train.py │ Fine-tunes DistilBERT +│ │ Saves best checkpoint under models/ by macro-F1 └────────┬────────┘ │ ▼ @@ -59,8 +58,8 @@ Raw News Article │ ▼ ┌─────────────────┐ -│ predict.py │ Single article / file / interactive inference -│ │ Returns label + per-class confidence scores +│ predict.py │ Takes in a single user input +│ │ Returns label, per-class confidence scores └─────────────────┘ ``` @@ -76,8 +75,6 @@ Raw News Article | Training Backend | PyTorch, Accelerate | | Evaluation | Scikit-learn | | Hyperparameter Tuning | Optuna | -| Serving | FastAPI *(upcoming)* | -| Dashboard | Streamlit *(upcoming)* | | Containerization | Docker | --- @@ -94,14 +91,15 @@ project_root/ │ ├── scripts/ │ │ ├── train.py # Fine-tuning script │ │ ├── evaluate.py # Evaluation + result export -│ │ └── predict.py # Inference script +│ │ └── predict.py # Interactive script │ │ │ └── utils/ │ ├── dataset_loader.py # Data loading and inspection │ ├── preprocessing.py # Text cleaning and tokenization │ └── metrics.py # Metric callbacks and report utilities │ -├── models/ # Saved model checkpoints (generated) +├── models/ # Saved model checkpoints +│ (generated) │ └── distilbert-ag-news/ │ └── best/ # Best checkpoint by macro-F1 │ @@ -123,8 +121,10 @@ project_root/ ├── docker_bash.sh # Open interactive shell ├── docker_jupyter.sh # Launch Jupyter Lab ├── docker_clean.sh # Remove image and cache -├── 
run_jupyter.sh # Jupyter startup (runs inside container) -├── version.sh # Package version logger (runs at build) +├── run_jupyter.sh # Jupyter startup +│ (runs inside container) +├── version.sh # Package version logger +│ (runs at build) ├── bashrc # Shell config copied into image └── etc_sudoers # Sudo config copied into image ``` @@ -197,7 +197,7 @@ With options: ./run.sh --skip-build --text "Apple reports record iPhone sales" ``` -### Individual Steps +### Individual Steps (Used when you only want to run part of the process) Each pipeline step can also be run independently: diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh index aaa0ac4ee..739c36177 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_build.sh @@ -10,13 +10,13 @@ load_docker_vars # Enable BuildKit for faster, cached layer builds. export DOCKER_BUILDKIT=1 -# Pass any extra args (e.g. --no-cache) straight through to docker build. +# Pass any extra args (like --no-cache) straight through to docker build. EXTRA_ARGS="$*" run "docker build $EXTRA_ARGS -t $FULL_IMAGE_NAME $SCRIPT_DIR" echo "" -echo "✅ Image built: $FULL_IMAGE_NAME" +echo " Image built: $FULL_IMAGE_NAME" echo " Run './docker_bash.sh' to open an interactive shell." echo " Run './docker_train.sh' to start fine-tuning." echo " Run './docker_jupyter.sh' to launch Jupyter Lab." 
diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh index ff086b13e..9ffd7f889 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_clean.sh @@ -16,4 +16,4 @@ fi echo "" run "docker ps -a" -echo "✅ Cleanup complete. Run './docker_build.sh' to rebuild." +echo " Cleanup complete. Run './docker_build.sh' to rebuild." diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh index d70263216..eb5a3bcbd 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_jupyter.sh @@ -13,12 +13,12 @@ CONTAINER_NAME="${IMAGE_NAME}_jupyter" # -p maps host:container port; -d runs detached so the terminal stays free. OPTS=$(base_run_opts "$CONTAINER_NAME" "-d -p ${JUPYTER_PORT}:8888") -echo "🔬 Starting Jupyter Lab on http://localhost:${JUPYTER_PORT}/lab" +echo " Starting Jupyter Lab on http://localhost:${JUPYTER_PORT}/lab" echo " (container: $CONTAINER_NAME)" echo "" run "docker run $OPTS $FULL_IMAGE_NAME /bin/bash run_jupyter.sh" echo "" -echo "✅ Jupyter is running. Open: http://localhost:${JUPYTER_PORT}/lab" +echo " Jupyter is running. 
Open: http://localhost:${JUPYTER_PORT}/lab" echo " Stop with: docker stop $CONTAINER_NAME" diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh index 1ebcf913f..5023b0659 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_predict.sh @@ -1,16 +1,11 @@ #!/bin/bash -# ───────────────────────────────────────────────────────────────────────────── # Run inference with the fine-tuned model inside Docker. -# -# All CLI flags are forwarded directly to predict.py: -# -# Usage: # ./docker_predict.sh --text "Apple reports record iPhone sales" # ./docker_predict.sh --file /app/articles.txt # file must be inside ./ # ./docker_predict.sh # interactive mode # # Note: run ./docker_train.sh first so the model checkpoint exists. -# ───────────────────────────────────────────────────────────────────────────── + set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -20,7 +15,7 @@ load_docker_vars CONTAINER_NAME="${IMAGE_NAME}_predict" # Only allocate a TTY for interactive mode (no args). -# When --text or --file is passed it runs non-interactively (e.g. from run.sh). 
+# When --text or --file is passed it runs non-interactively if [[ $# -eq 0 ]]; then EXTRA="-it" else diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh index 4fe1c06c3..8eaf87422 100755 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/docker_train.sh @@ -15,11 +15,11 @@ CONTAINER_NAME="${IMAGE_NAME}_train" # No -it so it can be run non-interactively (e.g. in CI or nohup). OPTS=$(base_run_opts "$CONTAINER_NAME") -echo "🚀 Starting training — args: $*" +echo " Starting training — args: $*" echo " Fine-tuned model will be saved to ./models/ on your host." echo "" run "docker run $OPTS $FULL_IMAGE_NAME python project_files/scripts/train.py $*" echo "" -echo "✅ Training complete. Check ./models/ for the saved checkpoint." +echo " Training complete. Check ./models/ for the saved checkpoint." diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py index 32316ffaf..6a7e23d36 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/predict.py @@ -1,7 +1,5 @@ # scripts/predict.py -""" -This script is to run the fine-tuned model on custom text input. 
-""" +#This script is to run the fine-tuned model on custom text input. import argparse import os @@ -17,9 +15,7 @@ def load_model(model_dir: str): - """ - Load a fine-tuned model and tokenizer(best model that we got from train.py) - """ + #Load a fine-tuned model and tokenizer(best model that we got from train.py) best_path = os.path.join(model_dir, "best") load_path = best_path if os.path.isdir(best_path) else model_dir @@ -36,9 +32,7 @@ def load_model(model_dir: str): def predict(texts, tokenizer, model, device="cpu"): - """ - Predict category labels for a list of texts. - """ + #Predict category labels for a list of texts. cleaned = [clean_text(t) for t in texts] inputs = tokenizer( cleaned, diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py index edcb22a29..4e39392b1 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/scripts/train.py @@ -1,6 +1,4 @@ """ -What this script does - 1. Loads AG News dataset and applies the preprocessing pipeline. 2. Instantiates a pre-trained DistilBERT model with a classification head (AutoModelForSequenceClassification). @@ -58,9 +56,7 @@ def parse_args(): def build_model(model_name: str): - """ - Load a pre-trained transformer model with a sequence classification head. - """ + #Load a pre-trained transformer model with a sequence classification head. 
print(f"\n[train] Loading model: {model_name}") model = AutoModelForSequenceClassification.from_pretrained( model_name, @@ -77,11 +73,9 @@ def build_model(model_name: str): def build_training_args(output_dir: str, epochs: int, batch_size: int, lr: float): """ - Configure HuggingFace TrainingArguments. - - - evaluation_strategy = "epoch" → evaluate on val set after every epoch - - load_best_model_at_end = True → restore best checkpoint after training - - metric_for_best_model = "f1_macro" + Configure HuggingFace training arguments. + - load_best_model_at_end restores best checkpoint after training + - metric_for_best_model: f1_macro chosen over accuracy to account for class imbalance """ use_fp16 = torch.cuda.is_available() return TrainingArguments( @@ -144,7 +138,7 @@ def main(): print(f"\n[train] Starting fine-tuning — {args.epochs} epoch(s)...") trainer.train() - # ── Step 5: Save best model ───────────────────────────────────────────────── + #Step 5: Save best model best_path = os.path.join(out_dir, "best") trainer.save_model(best_path) tokenizer.save_pretrained(best_path) diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py index 6d0a0f0f8..2f03a268f 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/preprocessing.py @@ -25,16 +25,9 @@ def clean_text(text: str) -> str: def get_tokenizer(model_name: str = DEFAULT_MODEL): """ - Load a HuggingFace AutoTokenizer for the given model checkpoint. - - Parameters - ---------- - model_name : str - HuggingFace model hub ID (e.g. 
'distilbert-base-uncased') - - Returns - ------- - tokenizer : PreTrainedTokenizer + Load a HuggingFace AutoTokenizer + Parameters: model_name [HuggingFace model hub ID (like 'distilbert-base-uncased')] + Returns: PreTrainedTokenizer """ print(f"[preprocessing] Loading tokenizer: {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) From f8f5bc604a430c640bc94f7cbd6a6ba8289ed282 Mon Sep 17 00:00:00 2001 From: riyaapuri Date: Fri, 8 May 2026 16:40:32 -0400 Subject: [PATCH 8/8] Created pynb for tutorial, fix bug in dataset loader --- .../project_files/utils/dataset_loader.py | 3 + .../tutorial-runthrough.ipynb | 1105 +++++++++++++++++ 2 files changed, 1108 insertions(+) create mode 100644 class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/tutorial-runthrough.ipynb diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py index c11962a0d..62920367f 100644 --- a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/project_files/utils/dataset_loader.py @@ -25,7 +25,10 @@ def load_ag_news(): def get_subsets(dataset): if TRAIN_SUBSET: dataset["train"] = dataset["train"].shuffle(seed=SEED).select(range(TRAIN_SUBSET)) + val_size = max(1, TRAIN_SUBSET // 10) + dataset["validation"] = dataset["validation"].shuffle(seed=SEED).select(range(val_size)) print(f"Using train subset: {TRAIN_SUBSET}") + print(f"Using validation subset: {val_size}") if EVAL_SUBSET: dataset["test"] = dataset["test"].shuffle(seed=SEED).select(range(EVAL_SUBSET)) print(f"Using 
test subset: {EVAL_SUBSET}") diff --git a/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/tutorial-runthrough.ipynb b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/tutorial-runthrough.ipynb new file mode 100644 index 000000000..13daebbe2 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask443_DATA605_Spring2026_HuggingFace_Text_Classification_Model/tutorial-runthrough.ipynb @@ -0,0 +1,1105 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9cc58d01", + "metadata": {}, + "source": [ + "# HuggingFace News Article Text Classifier TUTORIAL!\n", + "1. Config review\n", + "2. Data loading & splits\n", + "3. Preprocessing\n", + "4. Training\n", + "5. Predictions\n", + "6. Evaluation & results" + ] + }, + { + "cell_type": "markdown", + "id": "54592dad", + "metadata": {}, + "source": [ + "---\n", + "## Add project to Python path" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e5ce31d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: /app/project_files\n", + "Python path set up correctly.\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Make sure project_files is on the path so all imports work\n", + "PROJECT_ROOT = os.path.abspath('project_files')\n", + "if PROJECT_ROOT not in sys.path:\n", + " sys.path.insert(0, PROJECT_ROOT)\n", + "\n", + "print('Project root:', PROJECT_ROOT)\n", + "print('Python path set up correctly.')" + ] + }, + { + "cell_type": "markdown", + "id": "b58749d5", + "metadata": {}, + "source": [ + "---\n", + "## Running Config File\n", + "Central configuration. Every other module imports from here — nothing is hardcoded anywhere else." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2553f792", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#Dataset\n", + "DATASET_NAME = \"ag_news\"\n", + "NUM_LABELS = 4\n", + "LABEL_NAMES = [\"World\", \"Sports\", \"Business\", \"Sci/Tech\"]\n", + "LABEL2ID = {name: i for i, name in enumerate(LABEL_NAMES)}\n", + "ID2LABEL = {i: name for i, name in enumerate(LABEL_NAMES)}\n", + "\n", + "#Model\n", + "DEFAULT_MODEL = \"distilbert-base-uncased\"\n", + "BERT_MODEL = \"bert-base-uncased\" \n", + "ROBERTA_MODEL = \"roberta-base\" \n", + "\n", + "#Training\n", + "OUTPUT_DIR = \"models/distilbert-ag-news\"\n", + "EPOCHS = 2\n", + "BATCH_SIZE = 16\n", + "LEARNING_RATE = 2e-5\n", + "WEIGHT_DECAY = 0.01\n", + "WARMUP_STEPS = 500\n", + "MAX_LENGTH = 128 #Max token length per article\n", + "TRAIN_SUBSET = 100\n", + "EVAL_SUBSET = 50\n", + "\n", + "#Evaluation\n", + "RESULTS_DIR = \"results\"\n", + "\n", + "#Reproducibility\n", + "SEED = 42\n", + "\n" + ] + } + ], + "source": [ + "# Read and print config.py so we can walk through it on camera\n", + "with open('project_files/config.py') as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "id": "ec6d0446", + "metadata": {}, + "source": [ + "---\n", + "## Data Loading & Splits\n", + "Loads AG News from HuggingFace hub and applies the 90/10 train/validation split.\n", + "\n", + "**Full dataset sizes (when TRAIN_SUBSET = None):**\n", + "- Train: 108,000 (90% of original 120k)\n", + "- Validation: 12,000 (10% of original 120k)\n", + "- Test: 7,600 (original test split — untouched during training)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b31ee031", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using train subset: 100\n", + "Using validation subset: 10\n", + "Using test subset: 50\n", + "\n", + "Dataset Summary:\n", + "\n", + " Split: train (100 examples)\n", + " [0] World 28 \n", + " [1] Sports 26 \n", + " [2] Business 17 \n", + " [3] Sci/Tech 29 \n", + "\n", + " Split: test (50 examples)\n", + " [0] World 13 \n", + " [1] Sports 12 \n", + " [2] Business 16 \n", + " [3] Sci/Tech 9 \n", + "\n", + " Split: validation (10 examples)\n", + " [0] World 1 \n", + " [1] Sports 2 \n", + " [2] Business 5 \n", + " [3] Sci/Tech 2 \n", + "────────────────────────────────────────────────────\n", + "\n" + ] + } + ], + "source": [ + "from utils.dataset_loader import load_ag_news, get_subsets, summarize_dataset\n", + "\n", + "dataset = load_ag_news()\n", + "dataset = get_subsets(dataset)\n", + "summarize_dataset(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d077dc04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Actual split sizes after subsetting:\n", + " Train : 100\n", + " Validation : 10\n", + " Test : 50\n", + "\n", + "The test split is completely untouched during training.\n", + "Validation is used by the Trainer to pick the best checkpoint.\n" + ] + } + ], + "source": [ + "# Confirm exact split sizes\n", + "print('Actual split sizes after subsetting:')\n", + "print(f' Train : {len(dataset[\"train\"]):,}')\n", + "print(f' Validation : {len(dataset[\"validation\"]):,}')\n", + "print(f' Test : {len(dataset[\"test\"]):,}')\n", + "print()\n", + "print('The test split is completely untouched during training.')\n", + "print('Validation is used by 
the Trainer to pick the best checkpoint.')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7fbc5e93", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "3 Sample Articles from 'train' split\n", + "\n", + " [1] Label: Sci/Tech\n", + " Text : Space Station Tinkering Works (CBS/AP) Space station commander Gennady Padalka and flight engineer Michael Fincke completed a five-hour 21-minute spacewalk, successfully installing a new coolant syste...\n", + "\n", + " [2] Label: World\n", + " Text : Kerry and Bush Congratulate Red Sox (AP) AP - Count Sen. John Kerry of Massachusetts among those Boston baseball fans who have waited a lifetime to see the Red Sox win the World Series. President Bush...\n", + "\n", + " [3] Label: Sci/Tech\n", + " Text : Getting Listed in Netscapes Open Directory Project Getting Listed in Netscape's Open Directory Project\\\\When you are deciding on the major search engine directories that you want your website to be li...\n", + "────────────────────────────────────────────────────\n", + "\n" + ] + } + ], + "source": [ + "# Show a few raw sample articles to see what the data looks like\n", + "from utils.dataset_loader import get_sample_articles\n", + "_ = get_sample_articles(dataset, n=3, split='train')" + ] + }, + { + "cell_type": "markdown", + "id": "3f887bbe", + "metadata": {}, + "source": [ + "---\n", + "## Preprocessing\n", + "Three stages: **clean → tokenize → format as PyTorch tensors**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b76e8232", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw : ' NASA launches new satellite & rover into orbit. 
https://nasa.gov '\n", + "Clean : 'NASA launches new satellite rover into orbit.'\n" + ] + } + ], + "source": [ + "from utils.preprocessing import get_tokenizer, tokenize_dataset, clean_text\n", + "\n", + "raw = ' NASA launches new satellite & rover into orbit. https://nasa.gov '\n", + "print('Raw :', repr(raw))\n", + "print('Clean :', repr(clean_text(raw)))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5dbc8d54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[preprocessing] Loading tokenizer: distilbert-base-uncased\n" + ] + } + ], + "source": [ + "# Load the tokenizer — downloads DistilBERT vocabulary\n", + "tokenizer = get_tokenizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "191df614", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text : Apple reports record iPhone sales this quarter\n", + "Tokens : ['apple', 'reports', 'record', 'iphone', 'sales', 'this', 'quarter']\n", + "IDs : [101, 6207, 4311, 2501, 18059, 4341, 2023, 4284, 102]\n", + "\n", + "Note: [CLS]=101 prepended, [SEP]=102 appended automatically\n" + ] + } + ], + "source": [ + "# Show how a sentence is split into subword tokens\n", + "sample_text = 'Apple reports record iPhone sales this quarter'\n", + "tokens = tokenizer.tokenize(sample_text)\n", + "ids = tokenizer.encode(sample_text)\n", + "print('Text :', sample_text)\n", + "print('Tokens :', tokens)\n", + "print('IDs :', ids)\n", + "print()\n", + "print('Note: [CLS]=101 prepended, [SEP]=102 appended automatically')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "028d8e03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[preprocessing] Tokenizing dataset (max_length=128)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Tokenizing: 100%|██████████| 100/100 [00:00<00:00, 3948.70 
examples/s]\n", + "Tokenizing: 100%|██████████| 50/50 [00:00<00:00, 2319.76 examples/s]\n", + "Tokenizing: 100%|██████████| 10/10 [00:00<00:00, 1921.70 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[preprocessing] Tokenization complete.\n", + "\n", + "Tokenized split sizes:\n", + " Train : 100\n", + " Validation : 10\n", + " Test : 50\n", + "\n", + "input_ids shape : torch.Size([128])\n", + "attention_mask : tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) ...\n", + "label : tensor(2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Tokenize the full dataset — batched=True is 10-50x faster than row-by-row\n", + "tokenized = tokenize_dataset(dataset, tokenizer)\n", + "\n", + "print('\\nTokenized split sizes:')\n", + "print(f' Train : {len(tokenized[\"train\"]):,}')\n", + "print(f' Validation : {len(tokenized[\"validation\"]):,}')\n", + "print(f' Test : {len(tokenized[\"test\"]):,}')\n", + "print()\n", + "print('input_ids shape :', tokenized['train'][0]['input_ids'].shape)\n", + "print('attention_mask :', tokenized['train'][0]['attention_mask'][:10], '...')\n", + "print('label :', tokenized['train'][0]['label'])" + ] + }, + { + "cell_type": "markdown", + "id": "781384ac", + "metadata": {}, + "source": [ + "---\n", + "## Training\n", + "Fine-tunes DistilBERT on AG News using the HuggingFace Trainer API." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f4c87b9d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: You are sending unauthenticated requests to the HF Hub. 
Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using train subset: 100\n", + "Using validation subset: 10\n", + "Using test subset: 50\n", + "[preprocessing] Loading tokenizer: distilbert-base-uncased\n", + "[preprocessing] Tokenizing dataset (max_length=128)...\n", + "[preprocessing] Tokenization complete.\n", + "\n", + "[train] Loading model: distilbert-base-uncased\n", + "[train] Total params : 66,956,548\n", + "[train] Trainable params: 66,956,548\n", + "\n", + "[train] Starting fine-tuning — 2 epoch(s)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading weights: 100%|██████████| 100/100 [00:00<00:00, 4902.81it/s]\n", + "[transformers] \u001b[1mDistilBertForSequenceClassification LOAD REPORT\u001b[0m from: distilbert-base-uncased\n", + "Key | Status | \n", + "------------------------+------------+-\n", + "vocab_layer_norm.weight | UNEXPECTED | \n", + "vocab_transform.bias | UNEXPECTED | \n", + "vocab_layer_norm.bias | UNEXPECTED | \n", + "vocab_projector.bias | UNEXPECTED | \n", + "vocab_transform.weight | UNEXPECTED | \n", + "pre_classifier.weight | MISSING | \n", + "classifier.weight | MISSING | \n", + "pre_classifier.bias | MISSING | \n", + "classifier.bias | MISSING | \n", + "\n", + "Notes:\n", + "- UNEXPECTED:\tcan be ignored when loading from different task/architecture; not ok if you expect identical arch.\n", + "- MISSING:\tthose params were newly initialized because missing from the checkpoint. Consider training on your downstream task.\n", + "[transformers] `logging_dir` is deprecated and will be removed in v5.2. 
Please set `TENSORBOARD_LOGGING_DIR` instead.\n", + " 0%| | 0/14 [00:008,} KB')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "4db74f87", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: distilbert-base-uncased\n", + "Epochs: 2\n", + "Batch size: 16\n", + "Learning rate: 2e-05\n", + "\n", + "{'eval_loss': 1.399472951889038, 'eval_accuracy': 0.2, 'eval_f1_macro': 0.0833, 'eval_runtime': 0.8181, 'eval_samples_per_second': 12.223, 'eval_steps_per_second': 1.222, 'epoch': 1.0, 'step': 7}\n", + "{'eval_loss': 1.3993321657180786, 'eval_accuracy': 0.2, 'eval_f1_macro': 0.0833, 'eval_runtime': 0.7555, 'eval_samples_per_second': 13.237, 'eval_steps_per_second': 1.324, 'epoch': 2.0, 'step': 14}\n", + "{'train_runtime': 95.7502, 'train_samples_per_second': 2.089, 'train_steps_per_second': 0.146, 'total_flos': 6623606169600.0, 'train_loss': 1.3771130698067802, 'epoch': 2.0, 'step': 14}\n", + "\n" + ] + } + ], + "source": [ + "# Show the training log — accuracy and F1 per eval step\n", + "log_path = 'models/distilbert-ag-news/train_results.txt'\n", + "with open(log_path) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "id": "edb0c85b", + "metadata": {}, + "source": [ + "---\n", + "## Predictions\n", + "Load the saved model and classify new articles." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cfd5b1ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[predict] Loading model from: models/distilbert-ag-news/best\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading weights: 100%|██████████| 104/104 [00:00<00:00, 1723.50it/s]\n" + ] + } + ], + "source": [ + "from scripts.predict import load_model, predict, display_results\n", + "\n", + "tokenizer, model = load_model('models/distilbert-ag-news')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "37b85b81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "── Result 1 ─────────────────────────────────────────────\n", + " Text : Apple reports record iPhone sales this quarter\n", + " Prediction : Sports (26.45% confidence)\n", + " All scores :\n", + " Sports 26.45% ▓▓▓▓▓\n", + " Sci/Tech 25.56% ▓▓▓▓▓\n", + " World 24.10% ▓▓▓▓\n", + " Business 23.88% ▓▓▓▓\n", + "\n", + "── Result 2 ─────────────────────────────────────────────\n", + " Text : Manchester United defeats Arsenal 3-1 in Premier League\n", + " Prediction : Sports (26.66% confidence)\n", + " All scores :\n", + " Sports 26.66% ▓▓▓▓▓\n", + " Sci/Tech 26.31% ▓▓▓▓▓\n", + " World 24.23% ▓▓▓▓\n", + " Business 22.80% ▓▓▓▓\n", + "\n", + "── Result 3 ─────────────────────────────────────────────\n", + " Text : NASA launches new satellite to study climate change\n", + " Prediction : Sports (27.96% confidence)\n", + " All scores :\n", + " Sports 27.96% ▓▓▓▓▓\n", + " Sci/Tech 25.51% ▓▓▓▓▓\n", + " Business 23.35% ▓▓▓▓\n", + " World 23.18% ▓▓▓▓\n", + "\n", + "── Result 4 ─────────────────────────────────────────────\n", + " Text : Senate passes new immigration reform bill\n", + " Prediction : Sports (27.79% confidence)\n", + " All scores :\n", + " Sports 27.79% ▓▓▓▓▓\n", + " World 25.38% ▓▓▓▓▓\n", + " Sci/Tech 24.54% ▓▓▓▓\n", + " Business 
22.29% ▓▓▓▓\n" + ] + } + ], + "source": [ + "# One article per class — none of these were seen during training\n", + "articles = [\n", + " 'Apple reports record iPhone sales this quarter', # Business\n", + " 'Manchester United defeats Arsenal 3-1 in Premier League', # Sports\n", + " 'NASA launches new satellite to study climate change', # Sci/Tech\n", + " 'Senate passes new immigration reform bill', # World\n", + "]\n", + "\n", + "results = predict(articles, tokenizer, model)\n", + "display_results(results)" + ] + }, + { + "cell_type": "markdown", + "id": "623b99b9", + "metadata": {}, + "source": [ + "---\n", + "## Best Model Evaluation\n", + "Run the full evaluation on the held-out test set (never seen during training)." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "be3253d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[evaluate] Device: cpu\n", + "[evaluate] Loading model from: models/distilbert-ag-news/best\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading weights: 100%|██████████| 104/104 [00:00<00:00, 2588.29it/s]\n", + "Warning: You are sending unauthenticated requests to the HF Hub. 
Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using train subset: 100\n", + "Using validation subset: 10\n", + "Using test subset: 50\n", + "[preprocessing] Tokenizing dataset (max_length=128)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Tokenizing: 100%|██████████| 100/100 [00:00<00:00, 2463.72 examples/s]\n", + "Tokenizing: 100%|██████████| 50/50 [00:00<00:00, 6135.07 examples/s]\n", + "Tokenizing: 100%|██████████| 10/10 [00:00<00:00, 1296.06 examples/s]\n", + "Evaluating: 0%| | 0/2 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Show the confusion matrix heatmap\n", + "from IPython.display import Image, display\n", + "display(Image('results/confusion_matrix.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5c16222e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABUYAAALuCAYAAABiqcxsAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjksIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvJkbTWQAAAAlwSFlzAAAXEgAAFxIBZ5/SUgAAnidJREFUeJzs3X+81/P9P/7b6XcqKSVLSii2yK+YhML8/p2fMb+a/cDGe7aZ+TXfzT428+ttxsxvJkzY/P6d/FpIhFBERaIU6fePc17fP1zOeTs61Sl1jvW8Xnc5l53zfDyej+f9+Tqvc6qbx/PxKCuVSqUAAAAAABRIg/ouAAAAAACgrglGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAFB46623XsrKytKvX7/6LoVvgGOPPTZlZWUpKyur71IAAFiJBKMAsAoaN25cVbDz1Y9GjRqlbdu22XzzzXPCCSdk2LBh9V3u1zZixIicc8452XHHHdO5c+esttpqad68eTp27Jh+/frl9NNPz0svvVTfZX7jffW9cumll9bqvKlTp6Zp06bVzr3mmmtWbrGsMm6++eZq750HH3xwuccaNmxYzjrrrOy4447p0qVLWrZsmWbNmqVDhw7p1atXjj/++AwaNCiff/7516773HPPrVb38ccfv9Rznnzyyar+Z5111teuAQD4egSjAFAw5eXl+fTTTzNy5Mj87W9/S+/evXPsscdmwYIF9V3aMhszZkz23nvvbLXVVvn973+fp59+Ou+//37mzJmTuXPnZtKkSRk6dGj+9Kc/pVevXtl6663z5JNP1nfZ/zWuv/76WvX7xz/+kfnz56/kav7Pl8OlG264oc6uy8px7bXXVvv6uuuuW+YxRowYkX79+qV37975wx/+kKeffjoTJkzIrFmzMm/evEyePDkvvfRSrr322hx55JFZa621csIJJ2TixIkr6jZyww035K233lph4wEAK1+j+i4AAFi5evXqVS3gKi8vz6RJk/Lggw/mb3/7W+bPn58bb7wxTZo0yd///vd6rHTZPPzwwzn88MPz2WefJUk6duyYQw89NDvssEPWXnvtNGnSJB9//HGGDx+ee+65JyNGjMjw4cNz7rnnCkeXonnz5pkzZ05effXVvPTSS9lqq62W2L/y/VV53n+7G264QeBaR8aOHZunnnoqSdKqVavMmDEj99xzTz755JO0a9euVmPcdNNN+fGPf5y5c+cmSbp27Zr+/ftnu+22S4cOHdK8efN88sknGTt2bJ544ok89NBDmTlzZv72t79lo402yv/8z/+skHspLy/PGWeckbvuumuFjAcArHyCUQBYxbVo0SKbbLJJtWObbbZZ9thjjxx44IHZdddds3DhwlxzzTX5xS9+kY022qieKq29V155JQceeGBVCHfGGWfk7LPPTrNmzRbpu/fee+e3v/1tHn/88fzqV7+q61L/K22xxRb58MMPM27cuFx//fVLDEZHjBiRkSNHJkn69++fW265pa7KZBVw3XXXpVQqpaysLFdddVWOOOKIzJ8/PzfffHN+/vOfL/X8Bx54IMcdd1wqKirSuHHjXHTRRTnhhBPSqFHN/8w54YQT8umnn+aKK67I+eefv8Luo3379pkyZUruvvvuPP/88/nud7+7wsYGAFYej9IDQIH169cvBx98cJKkVCrl/vvvr+eKlm7hwoU59NBDq0LRCy+8MH/4wx9qDEW/bJdddsl//vOf/OAHP6iLMv+rlZWV5dhjj02SDBo0KPPmzVts38rHnrt06ZJddtmlLspjFVFeXl41M3f
HHXfMgAEDsvHGGyep3eP0U6dOzZFHHpmKioqUlZXlzjvvzM9+9rPFhqKV2rRpkzPPPDOvvvpqNt988697G0mS3/zmN2nQ4It/Wv36179eIWMCACufYBQACq5Pnz5Vn7/33ns19nn00Udz7LHHplu3bmnZsmVWW221bLDBBjn66KPz9NNPL3H8r+74/v777+fXv/51Ntlkk6yxxhrLtMlP8sV6lm+//XaSL4LdX/ziF7U+t2nTpjnqqKNq3b/SwoUL88gjj+TUU09Nnz590qFDhzRp0iQtW7bMhhtumCOPPDKPPfZYrcZ69NFHc+SRR6Zbt25p0aJFmjRpkrXXXjs9evTIoYcemquvvjqffPJJjee++eabOeWUU7L55pundevWady4cdq1a5fu3btn9913zwUXXJDRo0cv8/3VpHJn9k8//TT/+te/auwzb968DBo0KElyzDHHLNMu7rNnz85ll12W3XbbLR07dkzTpk3Ttm3bbL311jn77LPz8ccfL3JO5aZiO+20U9Wx4447bpFNo9Zbb71q5/Xr16/a8WnTpuV3v/tdttpqq6y55popKyur9jj1suxKP2LEiPzsZz/LZpttlrZt26Zx48Zp27Ztvvvd7+bUU0/Nf/7znxrPe//993PGGWdkm222SZs2bdK4ceO0adMmG264Yfr165ff/e53GTFixNJfyP9iDz30UD788MMkqQriK///9ddfzwsvvLDE8y+99NKqpTSOP/747Lvvvst0/fXXX7/q99LXtemmm+b73/9+kmTo0KFfawOpSgsWLMj111+f/fbbL506dUqzZs3SunXr9OzZMz//+c8X+/v6qquuqnr/Lu7999vf/raqT48ePRZbwzrrrJOysrLstddei7QtXLgw119/ffbaa6+q+lZbbbWsu+662WKLLfKTn/wkd955Z52uPwwAy6wEAKxy3nvvvVKSUpJS3759l9j3yiuvrOp7wgknVGubNm1aaY899qhqX9zHD37wg9K8efNqHL9Lly5VdTz00EOl1q1bL3L+JZdcUut722677arOu//++2t93pJ8ucaanHLKKUt9DZKUDj/88NLcuXNrHKO8vLx09NFH12qcq6++epHz//a3v5UaNWq01HOPPPLI5X4dKsfo06dPqVQqlXbZZZdSktLuu+9eY//bbrutlKRUVlZWevfdd0vXX3/9Eu+h0pAhQ0prr732Eu+jVatWpbvvvrvaeV9+Xy/po0uXLtXO69u3b9Xxl156qdSxY8dFzjnllFOq+h9zzDFVxxdn9uzZtf5+ftU999xTWm211ZZ6XuX3YVXVv3//UpJSixYtSjNmzCiVSqXSxIkTSw0bNiwlKf3oRz9a4vnf+ta3ql6r0aNH10XJ1fz2t7+tuv6jjz5aGjduXKlp06alJKXNNtusVFFRscg5Q4YMqTrnzDPPXOzYr776aqlbt25LfH80bty4dOWVVy5y7ttvv13V57zzzqtx/D59+lQba9KkSYv0efPNN6va//znP1drmzx5cmmLLbao1fv/7bffXtpLCQD1xhqjAFBwr7zyStXn66yzTtXns2bNSr9+/fLqq68m+WKtzkMOOSTrrbdeWrRokbfeeitXXXVVnnnmmapdpa+55prFXuf999/PoYcemiQ566yzsvPOO2f11VfP2LFj0759+1rVOmvWrKpZZM2bN8+uu+66TPe6vBYuXJiOHTtmv/32y7bbbpsNNtggLVq0yOTJk/PWW2/liiuuyJgxY3LbbbelXbt2+ctf/rLIGH//+99z0003JUk23njj/PjHP86mm26atm3bZs6cOXn33Xfz/PPP55577lnk3Ndffz0nnXRSysvL07Zt2/zoRz9K37590759+5SXl+fDDz/MSy+9lAceeGCZZm0uzXHHHZfHH388jz76aD744IN06tSpWnvl4879+vVL165dM3To0KWO+cQTT2SPPfbIggULsvrqq+cnP/lJttlmm3Tp0iWzZ8/OM888k//93//
N5MmTc8ghh+TRRx+tmtW3zjrr5LXXXsuLL76YgQMHJknOO++87L///tWu0aRJkxqvPXPmzBxwwAGZOnVqTjnllOy1115p165dPvjgg5SXl9f6dVmwYEH22muvqk281l577Zxwwgnp06dP2rZtmxkzZuS1117Lgw8+mIcffrjauZMnT86RRx6Z2bNnp3nz5jn++OOz2267pUOHDikrK8vHH3+cV155ZZHzVjVTpkzJvffemyQ5+OCD07JlyyRfbKK222675cEHH8xtt92WSy65JKutttoi57/11luZNGlSkqR79+7p3r173RW/GF26dMkJJ5yQSy+9NCNHjsygQYNy5JFHLvM4b7zxRvr06ZMZM2akWbNmOe6449K3b9+st956KS8vzwsvvJDLLrss7733Xk444YS0bNmyarZqkmy44Ybp3LlzJkyYkMcffzxnnnlmtfFnzpy5yGzcJ554IkcccUS1Y48//njV519dJuNnP/tZXn755aq2o446Kl27ds3qq6+e6dOn56233spTTz2V++67b5nvHwDqVH0nswDAilfbGaNjxowptWjRoqrvsGHDqtp+/OMfV83meuKJJ2o8v6KionTyySdXnf/ss88u0qdyNmaSUrt27b7WzK7nnnuuaqxtt912ucdZXI2Le63eeeed0oIFCxZ7fnl5eemoo44qJSk1bNiwNG7cuEX67LDDDqUkpXXXXbc0ffr0JY41bdq0asfOPvvsqvt++eWXl3gvn3zyyRLbl6TyGpUzFWfPnl01w/erM8/ef//9UoMGDUpJSjfddFOpVCotdcbo9OnTS2uttVbVNb56n5UmTZpUNVtuo402KpWXl1dr//Ksu+uvv36p91U5YzRJqVmzZqXnnntuif2XNmP0nHPOqWrfaaedlvj9HD9+fLWvr7322qpzvzoj9qu+zvfym+7CCy+seh2GDBlSre3222+varvxxhtrPH/QoEFVfY444og6qHhRX50xWiqVSlOmTCm1atWqlKTUtWvXRWbSL23GaHl5ealHjx5V7/0JEybUeO0ZM2aUevfuXUpSatu2benzzz+v1n7ccceVkpSaNm1amj17drW2+++/v6ptr732KiUpDRw4cJFrHHjggVXjf3n265w5c0qNGzcuJSkdcMABNc6MrTRz5szSnDlzFtsOAPXNGqMAUDDl5eWZOHFirr766uywww6ZNWtWkmTXXXet2kl54sSJVbMBzz777GprOn5ZWVlZ/vznP2fttddO8sXadkty/vnnf62ZXV9ee7NDhw7LPc6y2mCDDZa4oUuDBg1yySWXpGHDhikvL69xTc6PPvooSbLVVltl9dVXX+JYbdq0qfHcNdZYY6mbxay55ppLbF8WzZs3z+GHH54kVZvkVLrhhhtSUVGR1VdfPQcddFCtxrvqqqsyefLkNG7cOLfddtsi91lp7bXXzkUXXZQkGT16dK1motbWL37xi/Tu3Xu5z58xY0bVmrjt2rXLHXfcscTvZ+fOnat9Xfm9TLLYn6tKK/J7+U1T+fula9eu6du3b7W2/fffv+q9UTkb/au+/LtgrbXWWuK1Pvroo7z++uuL/ViR2rVrl1/96ldJvliz+W9/+9synX/nnXdm1KhRSZIbb7wx6667bo39WrZsWfX7dtq0aRk8eHC19p133jnJF+sAP/vss9XaKmeC9u7dO/vss0+1Y5UqKiqqZkTvtNNO1WaiT5s2LQsWLEjyf+v3Lk6LFi2WujEeANQnwSgArOKGDh1abWOaRo0apVOnTvnRj35UtcFNr169cuutt1adc++991b9w/erj1d+VZMmTao2cPrqP8C/rHHjxksda2k+//zzqs8rH72tD7Nmzcr48ePzxhtvVIUrkyZNqgqyato0p/Ix9KeeemqZN0iqPPezzz7LHXfc8TWrXzbHHXdckuSdd97JU089lSQplUpVQelhhx1W46PONbnzzjuTJDvssMMij+V/1ZdDwyW9r5bVMccc87XOHzJkSNX
78Ac/+MEyh5dfvu+rr776a9Xy32rYsGF54403kiRHH330IsFa06ZNM2DAgCTJ008/nXfeeWeRMZbld8GFF16YTTfddLEfK9qpp55a9R9uzjvvvMyYMaPW51b+jKy//vpV/6FqcTbddNOq999Xf0Yqg9Fk0dCz8utddtml6hH58ePHZ+zYsVV9RowYkU8//XSRsZIvwt/KsPPWW2/NzJkza3dzAPANZI1RACiopk2bZquttsrRRx+dH/zgB9VmRH55/bmvznhbkso1/2rSrVu3xQZoS5q11aZNm6q1T788M6+u/zH+3nvv5dJLL819992X9957L6VSabF9a9pV/kc/+lGGDBmSadOmpWfPntl7772z++67Z9ttt02PHj2WOCP1+9//fv74xz9mzpw5OfTQQ9OnT5/su+++6dOnT7bYYou0aNFihdxjTb773e/mO9/5Tt54441cf/312XHHHfPUU09VhSiVa30uTXl5eV566aUkX6xnuCxroS7pfbUsWrRokW7dun2tMSrvIVn6jM+a7L///llrrbUyefLk/OpXv8o//vGPHHjggdlhhx2y5ZZbZo011vha9X322Wf54IMPvtYYtdGkSZPlnv1dOQu0rKxssUH1sccemyuuuCKlUinXXXdd/t//+3/V2lu1alX1+bIEj3WhRYsWOfvss/PTn/40U6ZMyUUXXZRzzz23VudW/u599913v9bPSMeOHfPtb387b775ZrVg9JNPPqlaN3qXXXZJ9+7d06lTp3zwwQd5/PHHs8EGGyRZ8vqiTZo0yTHHHJOrrroqzz//fLp06ZKDDjoou+yyS7beeuusv/76ta4bAOpdPT/KDwCsBF9eY7RXr16l1157rerjjTfeKL3//vul+fPnL/b8vffeu1a7Ddf08VWV63duv/32i73eksY75phjqvrV1xqjt99+e9Vu07X56NevX43jXHTRRTXuRt6yZcvS3nvvXbrlllsWu5bpY489VurUqdMi5zZq1Ki07bbblv785z+XPv3006/1OlSO+dXd0P/85z9XrTc7Y8aMqt3Yv/3tb1frt6Q1RqdMmbLc76ljjz222ljLu8Zop06davU6LGmN0RNOOKGq7bXXXqvVeF/18ssvl7797W8vcp9lZWWlzTbbrPTb3/629OGHHy7X2F/+HqzMjy5duixXfTNnzqxag3NJ6x+XSqWqtTY7duxYWrhwYbW2W265paqWAQMGLHMdX177eHnVtMZopfnz55c22GCDqp/vjz/+uFQqLX2N0S+v+bwsHzX9zvnpT39aSr5Y97jyd0Pl+q2tWrWq+l1T+X4/9NBDq87dddddS0lK66yzTo33Pnv27NIxxxxTKisrW6SWDh06lI455pjS0KFDl+t1BYC6ZMYoAKziWrRokU022WSZzlm4cGHV5y+88EKaN2/+teto2LDh1x5j0003TaNGjbJw4cKMHDkyCxYsSOPGjb/2uEvyzjvv5Oijj868efOy2mqr5ZRTTskee+yRDTfcMGuuuWaaNGlSNbOrc+fOef/99xc7m/TUU0/N0UcfnX/+85957LHH8txzz+Xjjz/OzJkzc//99+f+++/P+eefn3vuuSddu3atdu4uu+ySd955J/fcc08efPDBPPPMM3n77bezcOHCDBs2LMOGDcv555+fQYMGZffdd1+hr8FRRx2V3/zmN5k1a1auvfbaqvUMKx+zr40vv6d22223qjVEa2Nxa5EuqxXxHlwRNt9887z22mt5+OGHc++99+aZZ57JG2+8kYqKiowcOTIjR47MBRdckL/97W85+uij67vcFeqOO+6omuFZuczH0nz44Yd56KGHsvfee1cd+/Jau8OHD1/hdX5djRs3znnnnZcBAwZk5syZOe+883LZZZct9bzKn5MePXrktttuq/X1apo1vvPOO+fyyy9PeXl5nnzyyRxwwAFVM0H79u1bNUt9l112yY033pghQ4akVCpl/vz5eeaZZ6rGqEnz5s1zww035Iw
zzsjtt9+eJ598Mi+88EJmzpyZjz/+ODfeeGNuvPHG9O/fP7fccot1RgH4xhKMAgCLaN++fdXn7dq1WySkW9EWFyR+VcuWLbP11lvnP//5T+bMmZNHHnmkWliyMlx33XWZN29ekmTw4MHZc889F9u3ck2+JWnXrl1OPPHEnHjiiUm+CF4ffvjhXHXVVXnttdfy+uuv5+CDD672yHalpk2b5pBDDskhhxySJJkyZUqeeOKJ3HTTTXnggQcybdq0HHTQQXnnnXeqNsRaETp06JC99tor99xzT04//fTMnTs3jRo1ylFHHVXrMdZcc82UlZWlVCpl7ty5yxzWf1N8+Wfjww8/XO77aNiwYfbaa6/stddeSZLp06dn6NChue2223L77bdnzpw5Oe6449KzZ8+lbrj1Zccee2yOPfbY5aqpLixuM6XanPfln/XvfOc76dChQz7++OO8/fbbGTNmzNfa2G1lOOyww3LBBRfk5ZdfzlVXXZWf//znSz2nffv2+eCDDzJjxoyv/TPSr1+/NGjQIBUVFXn88cerBaPf+973qvpVPio/ZcqUvPrqq/n0008zZ86cJIsPRit17949Z599ds4+++yUl5fn5Zdfzn333Zcrr7wykydPzl133ZUzzzxzmf5DCADUJZsvAQCL2Gqrrao+r9xw55vihz/8YdXnF1544Uq/3muvvZbki1mLSwpF33jjjeVa93TDDTfMSSedlOHDh1cFYCNGjKjVBk3t27fPYYcdlvvvv78qaJ01a1b+/e9/L3MdS1O5lujcuXOTJHvuuecyha+NGzdOz549k3wxw68yeFkey7L24orWq1evqs+HDBmywsZt3bp19ttvvwwaNCh//OMfk3yxM/jtt9++wq5R38aMGVM1E3H33XfPrbfeutSPyh3r77vvvkyePLnaeD/4wQ+qPq+L3wXLqqysrOp7OX/+/Jx99tlLPafyd++ECRMyfvz4r3X9Nm3aZMstt0zyxZqhX95g6cvrhnbs2DEbbbRRVb8lrS+6JA0bNkyvXr1y7rnn5vnnn69aU3rQoEFf6z4AYGUSjAIAi9h///2rHju+9NJLqz0GXd+OOuqobLjhhkmSJ598cplmIs2fPz//+Mc/lul6lfc+b968lJeXL7ZfbR6TXZImTZpU28xnypQpy3T+lx+fX9Zza2PvvfdOp06d0rRp0zRt2rTWmy592UEHHZQkmT17dv76178udy1fXtqhcjZvXdlpp53SunXrJMk111yTadOmrfBrrOzvZX257rrrqj4/+eSTc/jhhy/144QTTkiSLFiwIDfffHO18X7+859Xbch2zTXX5N577627m6ml3XbbrWrW5aBBgzJy5Mgl9q/8GUmSP//5z1/7+pXB5ptvvln1+nXo0GGR2aiV/b4cjG644YZZd911l+u66623XlXYuiq9hwFY9QhGAYBFdO3atWr9yFdeeSUDBw7M/Pnzl3jOo48+mmeffXal19aoUaP885//rArHfvnLX+bss8+umsm4OEOHDk3v3r1zzTXXLNP1Kv9xP3v27Nx666019vnHP/6Rv//970sc54YbblhiiDd37tyqGYhlZWXVdna+6667atzp/ssefPDBqs8rd5ZekRo1apT3338/c+fOzdy5c3PAAQcs8xgnn3xy2rVrlyQ588wzlzqzdebMmbn44otTUVFR7XjHjh2rPq/NzNoVqWXLljn11FOTfLHD98EHH5zPP/98sf0nTJhQ7esHH3xwkWNf9cADD1R9vjK+l/WhvLw8N910U5Kkbdu22XXXXWt13r777lu1fuZXH8Nv165dbr755qolGg466KBcfvnlS/0PObNmzarTQL1y1mipVMof/vCHJfYdMGBANt544yTJX//611x55ZVL7D9//vxce+21+fjjj2ts//Kj8JWzamt6PL4yGB06dGhefPHFase+6t13381jjz22xLrGjRuXN998M8mq8x4GYNVkjVEAoEaXXnppRowYkREjRuTmm2/
Os88+m+OPPz7f/e5307Zt28yePTvvv/9+XnjhhfzrX//Ku+++m6uvvjp9+vRZ6bVtscUWufvuu3PYYYdl+vTpOe+883Ldddfl8MMPzw477JC11147jRo1yuTJkzNixIjce++9eeGFF5Kk6tHc2jr22GNz2WWXpVQq5fjjj89LL72UPfbYI+3bt8/48eNz6623ZvDgwdloo40ybdq0RR73rXTcccflF7/4RfbZZ59sv/326d69e1q3bp3p06fnzTffzN///ve88sorSb6YNfbl8O+yyy7LEUccke9973vZZZdd0qNHj7Rr1y7z5s2rquGee+5JknTp0iX77bffcryqK1/r1q0zePDg7Lbbbpk/f34OOOCA7L333jn44IOz0UYbpVmzZvnss8/y5ptv5qmnnsp9992XWbNm5eSTT06DBv/33/M7deqUrl275r333su1116bb3/729l6662rNnhp3LjxSg1jzjjjjDzxxBMZOnRohgwZko033jgnnHBC+vTpk7Zt22bGjBl544038sADD+SBBx7IggULqs69/fbb849//CM77LBDdt999/Ts2TNrrbVWKioqMnHixPzrX/+qmtnXunXrHHPMMSvtPurSAw88kEmTJiVJ+vfvX+tN01ZbbbXss88+uf322/Pmm2/mP//5T3r37l3Vvt9+++Xaa6/NCSeckHnz5uVnP/tZLr744hx88MHZbrvtstZaa6VFixaZOXNm3nvvvTz77LMZPHhw1X9oWBEbyy3N1ltvnYMPPjiDBw9e6uzJRo0a5a677sp2222Xzz77LCeeeGIGDRqU73//+9lkk03SokWLfP755xkzZkyee+65/Pvf/860adPy9ttvp0OHDouMt/3226dJkyaZP39+pk+fnqTmwHOnnXZKgwYNMmvWrKpji1tfdMKECdl1112z/vrrZ7/99ss222yTLl26pHnz5vnkk08ybNiwXHnllVX/serkk0+u9WsFAHVu5W98DwDUtffee6+UpJSk1Ldv3+Ue5/PPPy8NGDCgaqwlfZSVlZVuv/32Rcbo0qXL165jcUaPHl3ac889a1VfktK2225bevrpp5e5xosvvrhUVla22HG7d+9eevvtt5c4Tm1r3GOPPUrTp0+vdm7fvn1rde6GG25YGjVq1HK/npXj9OnTZ7nOv/7666vGuPrqqxfb7/nnny9tsMEGtbqnVq1alRYuXLjIGDfddNNiz+nSpUu1vpWv31ePL84xxxxTNdbizJo1q3TEEUfU6h4WN/aSPjp06FDje/W/1f777191b4899tgynXv33XdXnXv88cfX2OfFF18sbb/99rX+OWvatGnpuOOOK33wwQfLfU+//e1vq8Z79NFHl9h39OjRpUaNGlWr4cwzz1xs/zFjxpS23HLLWt/LhAkTFjvWV39/vPfeezX222qrrar9Pp88eXKN/YYMGVKruho0aFA6/fTTSxUVFUt8bQCgPpkxCgAsVqtWrTJo0KD86le/yg033JCnnnoqEyZMyPTp09OsWbN07Ngx3/nOd9K3b98ccMABK333+q/q3r17HnjggYwYMSJ33313nnzyyYwbNy5Tp05NRUVF2rRpk4022ii9e/fOIYccUrURybL6+c9/nl69euXSSy/Ns88+m6lTp2aNNdbIhhtumIMOOignnnhi1UYji/PGG2/koYceynPPPZcxY8Zk8uTJ+eSTT9KkSZN06tQpW2+9dY488sgaN3i67bbb8tBDD+Wpp57K66+/no8++ihTpkxJqVRK+/bts9lmm+WAAw7IUUcdlaZNmy7XPdalbbbZJm+99VZuv/32/Pvf/86LL76YKVOmZN68eVl99dXTtWvXbLnllvne976XffbZp2q92y876qij8q1vfStXXHFFhg8fnsmTJ9fp49GrrbZabrnllpxyyim59tpr89RTT2XixImZPXt2Wrdune7du2eHHXbI4YcfXu28Sy65JHvuuWeefPLJjBw5MpMmTcrkyZOzcOH
CtG3bNj169Mg+++yTgQMHVq2f+d/u448/zv3335/ki/Ut+/Xrt0zn77nnnlWzq2+//fZceumlVY/XV+rVq1eefvrp/Oc//8m9996bp556KuPHj8+0adOycOHCtG7dOh07dswWW2yR7bffPgcddFDWWGONFXSHS9e9e/cMHDhwqUtuVOrWrVuGDx+ee++9N4MHD86wYcPy0UcfZfbs2WnVqlU6d+6czTbbLLvsskv233//Jd7LLrvskqFDhyZJ1l9//ay33nqL7ffSSy8lSTbddNO0b9++xn477LBDnn766Tz66KN5/vnnM2HChHz88ceZPn16WrRokfXXXz877rhjjj/++Gy66aa1ul8AqC9lpVKpVN9FAAAAAADUJZsvAQAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUTqP6LoDq1l577cyaNSudO3eu71IAAAAAoF5NmDAhLVq0yEcffbTCxzZj9Btm1qxZWbBgQX2XAQAAAAD1bsGCBZk1a9ZKGduM0W+Yypmio0aNqudKAAAAAKB+9ejRY6WNbcYoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwVvlg9KWXXsof//jH9O/fP506dUpZWVnKysqWe7xPP/00p5xySrp06ZKmTZumS5cu+Z//+Z989tlnK65oAAAAAGClKiuVSqX6LmJlOuCAA/Lvf/97kePLc9uffPJJevfunXfeeSfrr79+evXqlVGjRmXUqFHp3r17/vOf/6Rt27Zfq94ePXokSUaNGvW1xgEAAACA/3YrMytb5WeM9u7dO2effXbuueeeTJo0KU2bNl3usf7nf/4n77zzTvr375/Ro0fn9ttvz+uvv56f/exnGTNmTE499dQVWDkAAAAAsLKs8jNGv6pZs2aZN2/eMs8YnTRpUjp16pRGjRplwoQJ6dChQ1XbvHnzsu6662batGn58MMPs9Zaay13fWaMAgAAAMAXzBj9BnjooYdSUVGRHXbYoVoomiRNmzbNvvvum/Ly8jzwwAP1VCEAAAAAUFuC0VoaOXJkkmTLLbessb3y+KuvvlpnNQEAAAAAy0cwWksTJkxIknTq1KnG9srj48ePr7OaAAAAAIDl06i+C/hvMXPmzCTJaqutVmN7ixYtkiQzZsyo1XiV6yN81dixY9OlS5dMnDhxOaoEAAAAgFXHwoUL06jRyokwzRgFAAAAAArHjNFaatmyZZJk9uzZNbbPmjUrSdKqVatajbe4nbQqZ5Kus846y1oiAHwj3PXOD7KwYl59l1GvGjVomv4bXlvfZQBAvfnlsDszr3xhfZdRr5o2bJQLtz2ovsuA/3ora7ZoIhittc6dOydJPvjggxrbK4936dKlzmoCgG+ihRXzUl4qdjCaivouAADq17zyhZlfUV7fZQAskUfpa2mzzTZLkowYMaLG9srjPXv2rLOaAAAAAIDlIxitpT322CMNGjTI008/ncmTJ1drmzdvXu699940bNgwe+21Vz1VCAAAAADUlmD0Ky6//PJsvPHG+c1vflPt+Le+9a0MGDAg8+fPz4knnpiFC/9vrZTTTjstU6ZMyfe///2stdZadV0yAAAAALCMVvk1Ru+///78/ve/r/p6/vz5SZJtt9226tjZZ5+dvffeO0nyySefZPTo0Zk0adIiY1166aUZNmxY7rzzzmy88cbp1atXRo0alddffz3dunXLxRdfvJLvBgAAAABYEVb5YHTKlCl5/vnnFzn+5WNTpkyp1Vjt2rXLCy+8kHPPPTf
/+te/cvfdd6dDhw45+eST8//9f/9f1lhjjRVVNgAAAACwEpWVSqVSfRfB/+nRo0eSZNSoUfVcCQAsn3+O+X7hd6VvWNY0h3b/R32XAQD15mfP3l74XembNGiYv/Q5rL7LgP96KzMrs8YoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAKEYzOmTMn55xzTrp3755mzZqlY8eOGThwYCZOnLjMYz366KPZe++90759+zRu3Dhrrrlmdtttt9x9990roXIAAAAAYGVY5YPRuXPnZuedd87vf//7zJw5M/vvv3/WXXfdXH/99dliiy3y7rvv1nqsSy+9NLvttlsefPDBdO/ePQcddFA23njjPPbYY+nfv3/OPPPMlXgnAAAAAMCKssoHo+edd16GDRuW3r17Z8yYMbn99tvz/PPP56KLLsqUKVMycODAWo0zZcqUnH766WncuHGGDBmSZ599NrfddlueffbZPPnkk2natGnOP//8ZQpaAQAAAID6sUoHo/Pnz8/ll1+eJPnrX/+ali1bVrWdeuqp6dmzZ4YOHZqXXnppqWM9//zzmTdvXnbeeef07du3WtuOO+6Y3XffPaVSKcOHD1+xNwEAAAAArHCrdDD67LPPZvr06dlggw2yxRZbLNJ+8MEHJ0nuvffepY7VtGnTWl1zzTXXXLYiAQAAAIA6t0oHoyNHjkySbLnlljW2Vx5/9dVXlzrWNttskzXWWCNPPPFEhg4dWq3tqaeeysMPP5xu3bplhx12+JpVAwAAAAAr2yodjE6YMCFJ0qlTpxrbK4+PHz9+qWO1bt061157bRo0aJCddtop22+/fQ4//PBsv/326devX7beeus8/PDDadKkyYq7AQAAAABgpWhU3wWsTDNnzkySrLbaajW2t2jRIkkyY8aMWo3Xv3//PPjggzn00EPz7LPPVh1fffXVs9tuu2WdddapdW09evSo8fjYsWPTpUuXTJw
4sdZjAcA3SYPPOyWlBfVdRr1qUNbYn+UAFNpa8xpkYam+q6hfjcoa+PsArAALFy5Mo0YrJ8JcpWeMrmgXXXRRvve972XHHXfMq6++mpkzZ+bVV1/NzjvvnHPOOSf9+/ev7xIBAAAAgFpYpWeMVu5CP3v27BrbZ82alSRp1arVUsd68skn88tf/jJbbrll7rjjjjRo8EWmvOmmm2bw4MHp1atX7r///jz44IPZc889lzreqFGjajxeOZN0WWafAsA3ScWsD1JemlffZdSvsqb+LAeg0CaPq8j8ivL6LqNeNWng3/awIqys2aLJKj5jtHPnzkmSDz74oMb2yuNdunRZ6lg333xzkuTAAw+sCkUrNWzYsGq26FNPPbXc9QIAAAAAdWOVDkY322yzJMmIESNqbK883rNnz6WOVRmitm7dusb2yuOffvrpMtcJAAAAANStVToY7dOnT1q3bp2xY8fmlVdeWaR98ODBSZJ99913qWOtvfbaSZLhw4fX2P7iiy8mSdZbb73lKxYAAAAAqDOrdDDapEmT/PSnP02SnHTSSVVriibJxRdfnFdffTV9+/bNVlttVXX88ssvz8Ybb5zf/OY31cY64IADkiS33HJL7rvvvmpt//73vzNo0KA0aNAgBx544Eq6GwAAAABgRVmlN19KkrPOOiuPPfZYnnvuuXTr1i077LBDxo8fn+effz7t27fPddddV63/J598ktGjR2fSpEnVjh9wwAE55JBDcscdd2TfffdNr1690rVr17z33ntVs0j/8Ic/ZKONNqqzewMAAAAAls8qPWM0SZo1a5YhQ4bk7LPPzmqrrZZ//etfGT9+fI499tiMGDEi66+/fq3GKSsry+23355rr702O+64Y955553cfffdGTduXPbaa688+OCDOeOMM1by3QAAAAAAK0JZqVQq1XcR/J8ePXokSUaNGlXPlQDA8vnnmO+nvDSvvsuoVw3LmubQ7v+o7zIAoN787NnbM7+ivL7LqFdNGjTMX/ocVt9lwH+9lZmVrfIzRgEAAAAAvkowCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAA
AFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUziofjM6ZMyfnnHNOunfvnmbNmqVjx44ZOHBgJk6cuFzjjRs3Lj/5yU/StWvXNG3aNO3atUvv3r3z5z//eQVXDgAAAACsLKt0MDp37tzsvPPO+f3vf5+ZM2dm//33z7rrrpvrr78+W2yxRd59991lGu/BBx9Mjx498ve//z1rrrlm+vfvny233DLjxo3LVVddtZLuAgAAAABY0RrVdwEr03nnnZdhw4ald+/eeeSRR9KyZcskycUXX5xf/OIXGThwYJ588slajfXWW2+lf//+adWqVR599NFst912VW0VFRUZMWLEyrgFAAAAAGAlWGVnjM6fPz+XX355kuSvf/1rVSiaJKeeemp69uyZoUOH5qWXXqrVeKeeemrmzp2bG264oVoomiQNGjRIr169VlzxAAAAAMBKtcoGo88++2ymT5+eDTbYIFtsscUi7QcffHCS5N57713qWO+//34efvjhrL/++tlrr71WeK0AAAAAQN1aZR+lHzlyZJJkyy23rLG98virr7661LGefPLJVFRUZLvttsvChQtz11135dlnn015eXk22WSTHHbYYWnTps2KKx4AAAAAWKlW2WB0woQJSZJOnTrV2F55fPz48Usd64033kiStGzZMjvssEOGDRtWrf3MM8/M4MGDs9NOO32dkgEAAACAOrLKBqMzZ85Mkqy22mo1trdo0SJJMmPGjKWO9emnnyZJrrnmmrRs2TKDBg3KHnvskSlTpuT3v/99/vGPf+TAAw/MqFGjss4669Sqvh49etR4fOzYsenSpUsmTpxYq3EA4JumweedktKC+i6jXjUoa+zPcgAKba15DbKwVN9V1K9GZQ38fQBWgIULF6ZRo5UTYa6ya4yuSBUVFUm++EZcddVVGTBgQNq0aZPu3bvn5ptvztZbb53p06fniiuuqOdKAQAAAIDaWGVnjFbuQj979uwa22fNmpUkadWqVa3HatmyZQ455JBF2o877ri8+OKLGTp0aK3rGzVqVI3HK2eS1nbmKQB801TM+iDlpXn1XUb9Kmvqz3IACm3yuIrMryiv7zLqVZMG/m0PK8LKmi2arMIzRjt37pwk+eCDD2psrzzepUuXpY5V2adz584pKytbpH299dZLkkyePHl5SgUAAAAA6tgqG4xuttlmSZIRI0bU2F55vGfPnksda4sttkjyf2uNftW0adOS/N/MUgAAAADgm22VDUb79OmT1q1bZ+zYsXnllVcWaR88eHCSZN99913qWNttt13WXHPNfPTRRxk9evQi7ZWP0FcGqAAAAADAN9sqG4w2adIkP/3pT5MkJ510UtWaokly8cUX59VXX03fvn2z1VZbVR2//PLLs/HGG+c3v/lNtbEaNWqUU089NaVSKSeddFI+//zzqrbHHnssN9xwQ8rKyvLjH/94Jd8VAAAAALAirLKbLyXJWWedlcceeyzPPfdcunXrlh122CHjx4/P888/n/bt2+e6666r1v+TTz7J6NGjM2nSpEXG+tWvfpUhQ4bkscceS/fu3bPtttvmk08+ybBhw1JeXp4//OEP2Wabberq1gAAAACAr2GVnTGaJM2aNcuQIUNy9tlnZ7XVVsu//vWvjB8/Pscee2xGjBiR9ddfv9ZjNW7cOA888ED+9Kc/pV27dnn44Yfz2muvpW/fvrn33ntzxhlnrMQ7AQAAAABWpLJSqVSq7yL4Pz169EiSjBo1qp4rAYDl888x3095aV59l1GvGpY1zaHd/1HfZQBAvfnZs7dnfkV5fZdRr5o0aJi/9DmsvsuA/3orMytbpWeMAgAAAADURDAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOE0qo+LTp06NVdffXWefPLJfPDBB5kzZ07Gjh1b1X7//fdn6tSpOfzww9O
kSZP6KBEAAAAAWIXVeTD68MMP58gjj8ynn36aUqmUJCkrK6vW58UXX8zvf//7tG3bNvvss09dlwgAAAAArOLq9FH60aNHp3///pk2bVoOOOCA3HjjjenRo8ci/Q4//PCUSqXcfffddVkeAAAAAFAQdRqMnn/++ZkzZ05++9vf5s4778xRRx2VNdZYY5F+G2+8cdq2bZvnnnuuLssDAAAAAAqiToPRxx9/PK1atcpZZ5211L5dunTJ+++/XwdVAQAAAABFU6fB6OTJk7PhhhumYcOGS+3buHHjLFy4sA6qAgAAAACKpk6D0dVXXz2TJ0+uVd/x48enffv2K7kiAAAAAKCI6jQY3WKLLfLhhx9m1KhRS+z31FNP5eOPP853v/vdOqoMAAAAACiSOg1GjznmmJRKpfzwhz/M1KlTa+zzwQcf5Pjjj09ZWVmOO+64uiwPAAAAACiIRnV5sSOOOCK33HJLHnroofTo0SP7779/Jk6cmCS56KKL8vrrr2fw4MGZNWtWDj744Oy99951WR4AAAAAUBB1GoyWlZXlzjvvzIknnpibbropV199dVXbaaedllKplCQ59thjc+WVV9ZlaQAAAABAgdRpMJokzZs3z/XXX5/TTjstd955Z1599dV89tlnadmyZTbZZJMcfPDB6dmzZ12XBQAAAAAUSJ0Go5dddlnKysryox/9KN/+9rdz1lln1eXlAQAAAACS1PHmS6eeemouv/zyNG3atC4vCwAAAABQTZ0Go+3bt0/r1q3r8pIAAAAAAIuo02B0hx12yJtvvpk5c+bU5WUBAAAAAKqp02D0rLPOysKFC3PKKadU7UAPAAAAAFDX6nTzpc8++yxnnHFGfve732X48OH5/ve/n29/+9tp0aLFYs/Zcccd67BCAAAAAKAI6jQY7devX8rKylIqlTJy5MiMHDlyif3LysqycOHCOqoOAAAAACiKOg1GO3funLKysrq8JAAAAADAIuo0GB03blxdXg4AAAAAoEZ1uvkSAAAAAMA3gWAUAAAAACicOn2U/sueeeaZPPTQQ3nrrbcyY8aMtGrVKhtvvHH23HPP9OnTp77KAgAAAAAKoM6D0Q8//DDf//73M3To0CRJqVSqaisrK8v555+ffv365eabb07Hjh3rujwAAAAAoADqNBidOXNmdtlll4wePTplZWXZdddds+mmm2bttdfORx99lNdffz2PPPJIhgwZku9973t58cUX06JFi7osEQAAAAAogDoNRi+55JKMHj06m2yySQYNGpRNNtlkkT6jRo3KgAEDMmrUqFx66aU588wz67JEAAAAAKAA6nTzpbvuuisNGzbM3XffXWMomiQ9evTI3XffnbKystxxxx11WR4AAAAAUBB1Goy+88476dGjRzbYYIMl9ttggw3So0ePjB07to4qAwAAAACKpE6D0S9vtLQ0ZWVlK7ESAAAAAKDI6jQY3XDDDTNq1KiMGzduif3ee++9vP7661l//fXrpjAAAAAAoFDqNBg98MADU15enoMOOihvv/12jX3GjBmTgw46KKVSKQcddFBdlgcAAAAAFESd7kp/6qmn5pZbbsnLL7+c73znO9l9992zySabpEOHDvn444/z+uuv5+GHH055eXm6deuWU089tS7LAwAAAAAKok6D0VatWuXxxx/PgAED8txzz+WBBx7Igw8+WNVeuQbp9ttvn1tuuSUtW7asy/IAAAAAgIKo02A0SdZdd90888wzeeqpp/Lggw9m9OjRmTFjRlq1apWNNtooe+65Z3bccce6LgsAAAAAKJA6D0Yr7bjjjgJQAAAAAKBe1OnmSwAAAAAA3wR1Goy+8sorGThwYAYNGrTEfrfccksGDhyY1157rY4qAwAAAACKpE6D0WuuuSY33nhjOnXqtMR+6667bm644YZce+21dVQZAAAAAFAkdRqMPvnkk2nVqtVS1xbdcccd06pVqzzxxBN1VBkAAAAAUCR1Goy+//776dq1a636du3
aNR988MFKrggAAAAAKKI6DUbLy8tTKpVq3X/evHkrsRoAAAAAoKjqNBjt3Llz3nrrrXz22WdL7PfZZ5/lzTffXOpapAAAAAAAy6NOg9Fdd9018+fPz+mnn77EfmeccUYWLlyY3XbbrY4qAwAAAACKpE6D0VNPPTWrrbZarr766hx88MF54YUXUlFRkSSpqKjICy+8kEMOOSRXXXVVmjdvnl/84hd1WR4AAAAAUBCN6vJiXbp0yc0335wjjjgid999d+6+++40atQoLVq0yKxZs7Jw4cKUSqU0bdo0N998c9Zbb726LA8AAAAAKIg6nTGaJAceeGCef/757LPPPmncuHEWLFiQzz77LAsWLEiTJk1ywAEH5IUXXsiBBx5Y16UBAAAAAAVRpzNGK/Xs2TP//ve/M3fu3Lzzzjv5/PPPs/rqq6dbt25p2rRpfZQEAAAAABRInc8Y/bJmzZplk002yTbbbJOKioq8+uqrmTZtWn2WBAAAAAAUwEoNRmfPnp3nnnsuL7744mL7XHjhhWnXrl222GKLbLvttunQoUMOPfTQTJ06dWWWBgAAAAAU2EoNRu+6667ssMMOueSSS2ps/9Of/pTTTjstn3/+eUqlUkqlUsrLy3PnnXdmr732qtqxHgAAAABgRVqpwejQoUOTJMcee+wibZ988kl+97vfpaysLD169Mh9992XN998M1dffXVWX331DB8+PLfccsvKLA8AAAAAKKiVuvnS8OHD06RJk+y0006LtP3zn//MnDlz0rx589x///3p3LlzkmSjjTZKw4YNM3DgwAwePDhHHXXUyiwRAAAAACiglTpj9OOPP86GG26Yxo0bL9L25JNPJkn22GOPqlC00pFHHpkWLVrklVdeWZnlAQAAAAAFtVKD0alTp6ZFixY1tg0fPjxlZWXZbbfdFmlr3LhxOnfunClTpqzM8gAAAACAglqpwWizZs0yadKkRY5PnTo148aNS5JstdVWNZ7bsmXLlJeXr8zyAAAAAICCWqnB6IYbbpiJEydm7Nix1Y4/+uijSZLmzZtniy22qPHcjz76KG3btl2Z5QEAAAAABbVSg9E99tgjFRUV+elPf5o5c+YkST777LNccMEFKSsry5577pmGDRsuct7kyZPz/vvvp1u3biuzPAAAAACgoFZqMHryySdnjTXWyCOPPJK1114722yzTbp27ZqRI0emrKwsv/jFL2o876677kqS9O3bd2WWBwAAAAAU1EoNRjt06JB77rkn7du3z4wZMzJ8+PBMnz49ZWVl+dOf/pRtt922xvOuuOKKxW7MBAAAAADwdTVa2RfYfvvtM3bs2Nx///159913s/rqq2f33XfPBhtsUGP/Tz75JD/4wQ9SVlaW7bbbbmWXBwAAAAAU0EoPRpOkRYsWOfTQQ2vVt127djnllFNWckUAAAAAQJGt1EfpAQAAAAC+iQSjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABROIYLROXPm5Jxzzkn37t3TrFmzdOzYMQMHDszEiRO/1rhvv/12mjdvnrKysnzve99bQdUCAAAAACvbKh+Mzp07NzvvvHN+//vfZ+bMmdl///2z7rrr5vrrr88WW2yRd999d7nH/tGPfpR58+atwGoBAAAAgLqwygej5513XoYNG5bevXtnzJgxuf322/P888/noosuypQpUzJw4MDlGvfaa6/Nk08+mR/+8IcruGIAAAA
AYGVbpYPR+fPn5/LLL0+S/PWvf03Lli2r2k499dT07NkzQ4cOzUsvvbRM43788cf51a9+lV133TUDBgxYoTUDAAAAACvfKh2MPvvss5k+fXo22GCDbLHFFou0H3zwwUmSe++9d5nGPeWUUzJnzpxcccUVK6ROAAAAAKBurdLB6MiRI5MkW265ZY3tlcdfffXVWo/5wAMP5Pbbb88ZZ5yRDTfc8OsXCQAAAADUuVU6GJ0wYUKSpFOnTjW2Vx4fP358rcabNWtWTjzxxGy00Ub59a9/vWKKBAAAAADqXKP6LmBlmjlzZpJktdVWq7G9RYsWSZIZM2bUaryzzjor48ePz5AhQ9KkSZOvVVuPHj1qPD527Nh06dIlEydO/FrjA0B9afB5p6S0oL7LqFcNyhr7sxyAQltrXoMsLNV3FfWrUVkDfx+AFWDhwoVp1GjlRJir9IzRFWn48OG57LLLcvTRR6dfv371XQ4AAAAA8DWs0jNGK3ehnz17do3ts2bNSpK0atVqieMsXLgwP/zhD7PGGmvkwgsvXCG1jRo1qsbjlTNJ11lnnRVyHQCoaxWzPkh5aV59l1G/ypr6sxyAQps8riLzK8rru4x61aSBf9vDirCyZosmq3gw2rlz5yTJBx98UGN75fEuXboscZwPPvggr7zyStZee+0ccsgh1do+++yzJMlLL71UNZP0ySefXP6iAQAAAICVbpUORjfbbLMkyYgRI2psrzzes2fPWo330Ucf5aOPPqqx7bPPPsvQoUOXo0oAYFXTsKxpfZcAAAAsxSq9xmifPn3SunXrjB07Nq+88soi7YMHD06S7LvvvkscZ7311kupVKrxY8iQIUmSXXbZpeoYAAAAAPDNtkrPGG3SpEl++tOf5g9/+ENOOumkPPLII1U70V988cV59dVX07dv32y11VZV51x++eW5/PLLc+CBB+b888+vr9IBgFXEI488kvLyYq+x1rBhw+y22271XQYAAFSzSgejSXLWWWflsccey3PPPZdu3bplhx12yPjx4/P888+nffv2ue6666r1/+STTzJ69OhMmjSpnioGAFYl5eXlhQ9GAQDgm2iVfpQ+SZo1a5YhQ4bk7LPPzmqrrZZ//etfGT9+fI499tiMGDEi66+/fn2XCAAAAADUsVV+xmiSNG/ePL/73e/yu9/9bql9zz333Jx77rm1Hrtfv37WFQUAAACA/zKr/IxRAAAAAICvEowCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQA
AAIDCEYwCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMJpVN8FAMB/u18OuzPzyhfWdxn1qlXjpvl/2xxQ32UAAADUmmAUAL6meeULM7+ivL7LqFdFD4YBAID/Ph6lBwAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAAoHAEowAAAABA4QhGAQAAAIDCKUQwOmfOnJxzzjnp3r17mjVrlo4dO2bgwIGZOHFircf47LPPMmjQoAwYMCBdu3ZNkyZN0qpVq3z3u9/N//7v/2bBggUr8Q4AAAAAgBWpUX0XsLLNnTs3O++8c4YNG5Zvfetb2X///TNu3Lhcf/31ue+++zJs2LCsv/76Sx3nwgsvzB/+8IeUlZVl8803z3e/+91MmTIlzz77bF544YUMHjw4Dz/8cFZbbbU6uCsAAAAA4OtY5WeMnnfeeRk2bFh69+6dMWPG5Pbbb8/zzz+fiy66KFOmTMnAgQNrNU6LFi1y2mmnZdy4cRkxYkRuu+22PP7443nttdfSuXPnPPPMMznvvPNW8t0AAAAAACvCKh2Mzp8/P5dffnmS5K9//WtatmxZ1XbqqaemZ8+eGTp0aF566aWljvWb3/wmf/rTn9K5c+dqx7t165Y//vGPSZJbb711BVYPAAAAAKwsq3Qw+uyzz2b69OnZYIMNssUWWyzSfvDBBydJ7r333q91nc022yxJ8uGHH36tcQAAAACAurFKB6MjR45Mkmy55ZY1tlcef/XVV7/Wdd59990kydprr/21xgEAAAAA6sYqHYxOmDAhSdKpU6ca2yuPjx8//mtd53//93+TJPvvv//XGgcAAAAAqBur9K70M2fOTJLF7hTfokWLJMmMGTOW+xp/+9vf8thjj2WNNdbI6aefXuvzevToUePxsWPHpkuXLpk4ceJy1wRA3VprXoMsLNV3FfWr2cKyqj+7GnzeKSktqOeK6ldZg+ZVr0dFRUVKpWK/QSoqKvzdBqBg/P0oaVTWwJ9/sAIsXLgwjRqtnAhzlZ4xurI9/fTTOeWUU1JWVpbrrrsuHTt2rO+SAAAAAIBaWKVnjFbuQj979uwa22fNmpUkadWq1TKP/frrr2f//ffP/Pnzc9lll+XAAw9cpvNHjRpV4/HKmaTrrLPOMtcEQP2YPK4i8yvK67uMetWyUcOqP7sqZn2Q8tK8eq6ofjVssHrV6/Hqq68WfsZogwYN/N0GoGD8/Shp0sC/7RfnkUceSXl5sd8fDRs2zG677VbfZfxXWFmzRZNVPBjt3LlzkuSDDz6osb3yeJcuXZZp3Pfeey+77bZbPv3005x77rn52c9+9vUKBQAAACiI8vLywgejfDOs0o/Sb7bZZkmSESNG1Nheebxnz561HnPSpEnZddddM2nSpJxyyin57W9/+/ULBQAAAADq1CodjPbp0yetW7fO2LFj88orryzSPnjw4CTJvvvuW6vxPv300+y+++4ZO3ZsjjvuuFxyySUrslwAAAAAoI6s0sFokyZN8tOf/jRJctJJJ1WtKZokF198cV599dX07ds3W221VdXxyy+/PBtvvHF+85vfVBtr9uzZ2XvvvfPaa6/l0EMPzdVXX52ysrK6uREAAACAr6FUKqWiouIb8cEX6vv7UNcf38R191fpNUaT5Kyzzspjjz2W5557Lt26dcsOO+yQ8ePH5/nnn0/79u1z3XXXVev/ySefZPTo0Zk0aVK142eeeWb+85//pGHDhmnUqFF+8IMf1Hi9G264YWXdCgAAAECtlZeXZ+rUqZkxY0bmz59f3+VUadOmzTcyJKtLZWVlGT16dH2XUeeaNGmSVq1aZc0110zDhg3ru5xVPxht1qxZhgwZkvPPPz+
DBg3Kv/71r7Rt2zbHHntsfv/736dTp061GufTTz9N8sUvlUGDBi22n2AUAAAAqG/l5eWZMGFC5s6dW9+lLKJly5b1XQL1ZP78+Zk6dWpmzZqVzp0713s4usoHo0nSvHnz/O53v8vvfve7pfY999xzc+655y5y/IYbbhB6AgAAAP8Vpk6dmrlz56Zhw4bp0KFDWrRokQYNvhkrKk6fPt2M0bKytG7dur7LqFMVFRWZNWtWPv7448ydOzdTp07NWmutVa81FSIYBQAAACiSGTNmJEk6dOjwjQvgGjRoIBgtK/vGBNV1pUGDBlXvxQ8//DAzZsyo92C0WN8BAAAAgFVcqVSqWlO0RYsW9VwNVFf5npw/f369B+SCUQAAAIBVyJfDpqLNSuSb78vvScEoAAAAAEAdE4wCAAAAAIUjGAUAAAAACkcwCgAAAAAUjmAUAAAAgDrTunXrrLHGGsv0saK0adOm2kfbtm3TpUuX7LnnnrnpppvqfTOgr7rhhhtSVlaWc889d7nHOPbYY1NWVpYnn3xyhdW1qmhU3wUAAAAAQF0aMGBAkqS8vDzvvfdenn/++QwbNixDhw7NtddeW8/VUVcEowAAAAAF9cthd2Ze+cL6LqNGTRs2yoXbHrRSxr7iiiuqfT1kyJAceuihueuuu3LIIYdkjz32WCnXXVYHHnhgtt1227Rr1265xzj//PNz+umnp3PnziuwslWDYBQAAACgoOaVL8z8ivL6LqPe7bTTTjnssMNyyy235IEHHvjGBKOtW7dO69atv9YY3/rWt/Ktb31rBVW0arHGKAAAAACF17NnzyTJxIkTq461adMmPXv2zPz583PBBRdkm222SYcOHXLkkUdW9Zk9e3Yuvvji7LjjjunUqVM6deqUXXfdNbfeeutirzVt2rSceeaZ2XTTTdOiRYusvvrq2XTTTXPaaadl0qRJVf0Wt8bo/Pnzc8UVV2TrrbfOmmuumdVWWy3rrbde9tlnn9x2223V+i5pjdH3338/P/7xj9OlS5c0bdo0a621Vvr3758XX3xxkb7jxo1LWVlZ+vXrlzlz5uT000+vOm/DDTfMn/70p2/cGq1LY8YoAAAAAIU3Y8aMJEmTJk2qHa+oqMiRRx6Z//znP9luu+3So0ePtGnTJkkyZcqUHHjggRk1alQ6dOiQ7bbbLqVSKS+88EJOPPHEvPzyy7nggguqjTd69OgcdNBBmThxYtZee+3svvvuSZIxY8bkz3/+c7bbbrsccMABS6z1yCOPzODBg9OqVavssMMOWX311TNx4sQ888wzmTlzZg4//PCl3u9rr72WnXfeOZ988kk22mij9O/fPxMmTMjdd9+de++9N4MGDcohhxyyyHnz58/PbrvtljfeeCP9+vXLrFmzMnTo0Jx++umZMWNGzjvvvKVe+5tCMAoAAABAoZVKpTz88MNJkh49elRrmzhxYpo2bZoXXnghHTt2rNZ20kknZdSoUfnJT36Sc889N02bNk2STJ48OYcffniuvvrq7Lbbbvne976XJFm4cGGOOuqoTJw4Mf/zP/+TP/3pT9WC2FGjRqVZs2ZLrPW9997L4MGD06VLl7z00ktZc801q9rmzp2bl19+uVb3e+SRR+aTTz7Jaaedlj/+8Y8pKytLktx555059NBDM3DgwGy//faLPIb/n//8J3379s17772X1VdfPUkyfPjwbLvttrnkkkty+umnp2XLlkut4ZvAo/QAAAAAFFJ5eXnGjh2bn/70p3nxxRfTtGnTao/JVzrnnHMWCUVfe+21PProo9lyyy3zhz/8oSoUTZK11lorl156aZLkuuuuqzp+77335u233863v/3tXHjhhYvMTu3Ro0c22GCDJdY8ZcqUJMkWW2xRLRRNkmbNmqV3795Lve8nn3wyr732Wjp37pzzzjuvKhRNkoMOOigHHHBAZs6cWa32Sg0aNMhVV11VFYomSa9evbLnnntm9uzZGT58+FKv/00hGAUAAACgUNq
0aZM2bdqkXbt26dWrVwYNGpRWrVrlmmuuSdeuXav1LSsrq3EzpieeeCJJstdee6VBg0Ujtp49e6Zly5YZMWJE1bGhQ4cmSY466qg0bNhwuWrfeOON06JFi9x///3585//nA8//HCZx3j66aeTJIceemgaN268SPtRRx1Vrd+XdenSJRtttNEix7t3754k1dZI/abzKD0AAAAAhTJgwIAkX8x+bNWqVb7zne9k3333zRprrLFI3/bt21ebDVppwoQJSZLzzjtvietqzp07t+rzDz74IEkWCV+Xxeqrr56rr746P/rRj3LaaafltNNOS/fu3bPTTjvlqKOOSp8+fZY6RmWYut5669XYXnn8yxtRVerUqVON57Rq1SpJMm/evFrcxTeDYBQAAACAQrniiitq3bemUDT5YlOmJNl2222/VtC5PAYMGJDvfe97+fe//51HHnkkQ4cOzVVXXZWrrroqp556ai666KKvNf6XH63/qppmx/63EowCAAAAwDJaZ511kiR77713fvrTn9bqnMrZlu+9997Xvn779u1z/PHH5/jjj6/aPOqwww7LxRdfnIEDBy6yidSXVa6XOn78+Brbx40bl+T/7nFVtepEvAAAAABQR/r165ckue+++2p9Tt++fZMk//jHP6pmnK4Ileug7r333km+2N1+SXbYYYckyR133JHy8vJF2v/xj39U67eqEowCAAAAwDLq1atXdtpppzz//PP55S9/mc8//3yRPq+99loee+yxqq/33XffbLjhhnnjjTdy2mmnZcGCBdX6jxo1Ku++++4Sr/vyyy/nrrvuyvz586sdnzZtWp5//vkkybrrrrvEMfr165dNN90048aNyznnnJNSqVTVdvfdd+euu+5Ky5YtM3DgwCWO89/Oo/QAAAAABdW04Tc3Gvom11bpqquuysEHH5xrr702gwcPzqabbpq11147n3/+eUaNGpWJEyfmJz/5Sb73ve8lSRo1apQbb7wx/fv3z0UXXZRBgwald+/eKZVKefvtt/P666/n7rvvzvrrr7/Ya44fPz4HHXRQWrdunV69emXttdfOZ599lqeeeiozZszIvvvum969ey+x7rKystxyyy3Zaaed8v/+3//L3Xffnc033zwTJkzIs88+m0aNGuXaa6/Nt771rRX6en3TfPPfYQAAAACsFBdue1B9l/BfrX379nn44Ydz00035c4778yrr76aF154Ie3bt896662XH//4x+nfv3+1c77zne/kmWeeyVVXXZV77rknDzzwQJo2bZrOnTvn17/+dbbddtslXnPbbbfNeeedlyeeeCKjR4/O008/nTZt2qRnz575wQ9+kO9///u1qn3TTTfNiBEjct555+Whhx7K4MGD07p16xxwwAH5zW9+k2222Wa5X5f/FmWlL8+Vpd5VLoy7tLUgAPjm+Nmzt2d+xaLr8hRJy0ZNclHvg5Mk/xzz/ZSX5tVzRfWrSYPVc1C3a5MkDz74YI3rNhVJw4YNs+eee9Z3GQDUIX8/Spo0aJi/9DmsXq5dUVGR0aNHJ0k22mijVWIX8c8++6y+S1ihysrK0rp16/ouo14s6/tzZWZlZowCAAAAUGemT58e8/T4Jvjv/08GAAAAAADLSDAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACFIxgFAAAAAApHMAoAAAAAFI5gFAAAAAAoHMEoAAAAAFA4glEAAAAA6kzr1q2zxhprLNPHitKmTZtqH23btk3nzp2z66675sorr8yCBQtW2LVWphtuuCFlZWU599xzqx0/99xzU1ZWlhtuuKFe6vpv06i+CwAAAACAujRgwIAkSXl5eSZMmJAXXnghw4cPz8MPP5zBgwenUSORWRH4LgMAAAAU1F3v/CALK+bVdxk1atSgafpveO1KGfuKK66o9vXw4cOz7777ZujQobnzzjtz2GGHrZTr8s3iUXoAAACAglp
YMS/lpW/mR10Gtr169aqaRfrEE0/U2XWpX4JRAAAAgBWsSYOG9V0Cy2jjjTdOknzyySeLtJVKpQwePDj77bdf1ltvvay99tr57ne/mz/+8Y+ZPXt2jeMtWLAg1113XfbYY4906dIl3/rWt7LlllvmpJNOyiuvvFJt7FtvvTWHH354unfvnhYtWqRVq1bZZpttcsUVV6SiomKl3C8epQcAAACAzJw5M0nSrl27ascrKiryox/9KHfeeWdatmyZzTffPGussUZefvnl/OlPf8pjjz2We++9N82bN686Z9asWTn00EPz3HPPpUWLFtl2222z+uqr5/33388dd9yR1VdfPX379k2SzJs3L0cccUTWXHPNfOc738mWW26ZqVOn5rnnnstJJ52UF154wWZKK4lgFAAAAGAlqut1PMtKDdNx7mFZvUnHTJv7Xho0KPtqj7Rt1rXO6vlv8fjjjydJdtlll2rHL7/88tx5553Zfvvtc80116RDhw5Jkvnz5+eXv/xlbr755vzpT3+qtkP8b37zmzz33HPZbrvtcuONN1YLWydPnpz333+/6utGjRrl7rvvzt57753GjRtXHZ8yZUr22muv3HjjjRk4cGB23HHHlXHbheZRegAAAICVqO7X8Zyf5IvHr0uL+R9fqKioyHvvvZdTTz01zz33XPbaa6/079+/qn3hwoW57LLL0qJFi1x77bVVoWiSNGnSJH/605/SoUOH3HjjjVWPvE+aNCmDBg1K06ZNc+WVVy4yA3WttdZKr169qr5u1KhRDjjggGqhaJK0b98+559/fpLk3//+9wq/d8wYBQAAAKBg2rRps8ixY445JpdccknKyv5vhu3IkSMzderU7LTTTllrrbUWOad58+bZbLPN8sgjj2Ts2LHp1q1bnnnmmZSXl2f33XdP586da13TK6+8kkceeSTjx4/P7NmzUyqVMmPGjCTJ22+/vRx3ydIIRgEAAAAolMod6OfOnZtRo0ZlzJgxufHGG7PNNtvkiCOOqOo3YcKEJMmQIUNqDFO/bOrUqenWrVsmTpyYJOnatXbLFcyfPz/HHntsbr311sX2qQxIWbEEowAAAAAUyhVXXFHt68suuyy//e1v86tf/Srbb7991UzPysfj119//Xz3u99d4pht27Zdrlouvvji3Hrrrdl0001zwQUXZMstt0ybNm3SuHHjjBkzJhtttFFKJcsfrAyCUQAAAAAK7eSTT87QoUPzxBNP5IILLsjll1+eJOnYsWOSpFu3bouEqYuzzjrrJEnee++9WvW/++67kyS33nprevToUa3t3XffrdUYLB+bLwEAAABQeL/97W+TJLfffnvVI/RbbrllVl999Tz33HP59NNPazXO9ttvn4YNG+aJJ57IBx98sNT+leN26tRpkbZ//vOftS2f5SAYBQAAAKDwevbsmb333rtqJ/okadq0aU4++eTMmDEjRx11VMaNG7fIeR9++GFuu+22qq+/9a1v5fDDD8/cuXNz4oknZtq0adX6T5kyJcOHD6/6unv37kmSv/3tb9X6DR48ODfddNOKuj1q4FF6AAAAgIJq1KBpUlHfVdSsUYOmdX7NX//613nggQdyyy235Fe/+lU6dOiQn//853n77bdz++2357vf/W569uyZzp07Z8GCBXn77bczevTo9OjRI4cffnjVOOeff37efvvtPP300+nZs2d69+6dVq1a5YMPPsjIkSMzcODA7LLLLkmS0047LQ899FBOP/303HHHHenevXvefvvtDB8+PL/85S9z4YUX1vnrUBSCUQAAAICC6r/htfVdwjfKpptumn322Sf33ntv/vrXv+Z3v/tdGjRokL/97W/Zb7/9cuONN+bll1/OyJEjs8Yaa2SdddbJz372s/Tv37/aOK1atcq9996b66+/PnfccUeGDRuW8vLyrL322jnkkEOqhag77rhjnnnmmZx55pl5+eWXM2bMmGy66aa58847s+WWWwpGVyLBKAAAAACFUJt
1Qhf3+Ppee+2Vvfbaq9bXatKkSX784x/nxz/+8SJtZWVl1b7edttt8/jjj9c4Tk070h977LE59thjFzl+7rnn5txzz611jUUnGAUAAAAolFKmzq3b3c4bpGHaNOuSJJk+fXqNYR/UNcEoAAAAQOHUdTApCOWbx670AAAAAEDhCEYBAAAAgMIRjAIAAAAAhSMYBQAAAAAKRzAKAAAAABSOYBQAAAAAKBzBKAAAAABQOIJRAAAAAKBwBKMAAAAAQOEIRgEAAACAwhGMAgAAAACF06i+CwAAAACgrpWt4terWZs2bZbY3qdPn9x3331VX7/yyisZMmRIRowYkREjRuTDDz9Mknz66afLXUNFRUUGDRqUO++8MyNHjsyMGTPStm3brL322tlmm23Sr1+/HHnkkcs9PrUnGAUAAAAolLKs2Wz9ert669atl/mczz77bIXWMGDAgBqPd+vWrdrXf/7zn/PAAw+ssOvOnz8/RxxxRB5//PE0aNAg22yzTbp06ZJ58+Zl5MiRueaaa3LzzTcLRuuIYBQAAACAQrniiitq1W/rrbdOjx49ssUWW2TLLbfMZpttlnnz5i33da+++uo8/vjj6dSpUx599NFsvPHG1dpHjRqVG2+8cbnHZ9kIRgEAAAAK6pFHHkl5eXl9l1Gjhg0bZrfddqvXGv7nf/5nhY53zz33JElOO+20RULRJOnRo0cuuOCCFXpNFs/mSwAAAAAFVV5e/o3+WNVMnTo1SbLmmmsu87nvv/9+Tj755HTv3j3NmzdP27Zt06tXr/x//9//l88//7xa39mzZ+f3v/99NtlkkzRv3jytW7fOjjvumNtuu63Gsddbb72UlZWlVCrlL3/5SzbbbLOsttpq2Xzzzav6LFy4MFdeeWV69+6d1VdfPc2bN8/mm2+eSy+9NAsXLlzm+/kmEIwCAAAAQB1YZ511kiQ33XRTFixYUOvznn766fTs2TN/+ctfsmDBguy7777p06dPpk+fnnPPPTfvvvtuVd8ZM2Zkxx13zDnnnJPJkydnn332SZ8+ffLCCy9kwIABOeWUUxZ7nZ/85Cf5xS9+kbXWWiv77bdf1l//i7Vo58yZk9122y0nnnhixowZk2233Ta77rprJk2alJ///Oc56KCDUlFRsZyvSv3xKD0AAAAA1IGjjz46Tz31VB5++OFsuOGG6d+/f3r37p2tttoqG2ywQY3nTJs2LQcddFA+++yz/PnPf86pp56aBg3+b67jf/7zn3Ts2LHq6zPOOCMvvfRSdtppp/z73/9Oq1atkiRvvfVW+vbtm8suuyy77rpr9tlnn0Wuddddd+Xll19Ojx49qh3/5S9/mSFDhuSwww7LVVddVbWB1owZM3L44Yfnnnvuyd///vf85Cc/+dqvUV0yYxQAAACAQmnTpk2NHxMmTFip1z3ooIPyhz/8IS1atMiECRNy6aWX5rDDDsuGG26Yrl275o9//GPmzp1b7ZxrrrkmU6ZMyR577JFf/vKX1ULRJOndu3fWWmutJMmsWbNy7bXXpkGDBrniiiuqQtEk2XjjjXPWWWclSf73f/+3xvp+/etfLxKKTp48OVdffXXWXXfdXH/99VWhaJK0atUq1157bZo0aZIrr7xy+V+YemLGKAAAAACFMmDAgBqPt2jRYqVf+8QTT8wRRxyRRx99NE8++WRefPHFvP322xk3blx+85vf5N///neeeOKJNG/ePEny2GOPJUl+/OMfL3Xsl156KXPmzEmvXr1q3NzpqKOOysknn5xnn302FRUVi4Ss++233yLnPPnkk1mwYEH22GOPqpq+bO211063bt3y2muvZc6cOTX2+aYSjAIAAABQKFdcccVKGfeSSy7J22+/Xe1Yt27d8vOf/7zasTZt2uSHP/xhfvjDHyZJxo8fn7/+9a+5+OKLM2zYsFx88cU588wzk3yx6VKSxT5q/2Uffvhhki82U6rJGmuskdatW2f69On59NNPF9kEqnPnzoucM27cuCT
J1VdfnauvvnqJ1582bVrVOqr/DQSjAAAAALACPP7443n22WerHevTp88iwehXdenSJRdccEEWLlyYSy65JPfff39VMLqilZWVLbatWbNmixyr3FRp8803z2abbbbEsZs2bfr1iqtjglEAAAAAWAHuu+++r3X+zjvvnEsuuSSffPJJ1bF11103b731VsaOHZtNN910iedXbsI0fvz4GtunT5+ezz77LM2bN0+bNm1qVVOnTp2SJNtvv33+8pe/1Oqc/xY2XwIAAACAOlAqlZbY/s477yRJtcfRv/e97yVJ/v73vy91/K222irNmzfPSy+9tMgj/Unyj3/8I8kXs1i/ur7o4uy0005p2LBh7rvvvixYsKBW5/y3EIwCAAAAQB0YMGBArrrqqnz66aeLtD3//PP5/e9/nyQ5+OCDq44ff/zxadeuXR588MFceumli4Srw4YNy+TJk5N8sXnUwIEDU1FRkZNOOimzZs2q6jdmzJicd955SZKTTz651jWvs846GThwYMaNG5cBAwbk448/XqTPO++8kzvvvLPWY35TeJQeAAAAoKAaNmxY3yUs1jehtocffjgXXnhh1dfz589Pkuy6665Vx375y19m9913r9V4EydOzOmnn56zzjorm2++ebp27ZqKioqMHTs2r7zySpJk3333rbYDfdu2bXPHHXdkv/32y89//vNcdtll2XrrrTNnzpy8+eabeeedd/Lyyy9nrbXWSpKcf/75GTZsWB599NGsv/766du3b2bNmpUnnngic+fOzcknn5x99913mV6H//3f/824ceNy55135qGHHsrmm2+ezp07Z9asWXnjjTfyzjvvZP/9989BBx20TOPWN8EoAAAAQEHttttu9V3CN9rUqVMzfPjwRY5/+djUqVNrPd6NN96Yxx57LEOGDMm7776bBx98MPPmzUu7du2y995758gjj8zhhx++yAZJ/fr1y8iRI3PBBRfkoYceyr/+9a+0bNkyXbt2ze9+97tqO9a3atUqQ4cOzUUXXZTbb78999xzT5o0aZJevXrlxBNPzIABA5b5dWjevHkefPDB3HLLLbnxxhvzyiuv5IUXXkj79u3TpUuXHHXUUTn88MOXedz6JhgFAAAAoBBqeoR9SY444ogcccQRK+z666+/fn70ox/lxz/+cVq3br1M53bt2jVXXnllrfq2aNEi55xzTs4555xa9R83btxS+zRs2DBHH310jj766FqN+d9AMAoAAABQKKVMnftunV6xQRqmTbMuSb7YGX1pmxBBXRCMAgAAABROXQeTglC+eexKDwAAAAAUjmAUAAAAACgcwSgAAAAAUDiCUQAAAACgcASjAAAAAEDhCEYBAAAAgMIRjAIAAACsQkopr/qsVFGq11rgqyoqKqo+Lysrq8dKBKMAAAAAq5ayUhaUfZ7y0vzMm1O+9P5Qh2bNmpUkadKkSb0Ho43q9eoAAAAArHCzG41L8/IO+fyTpkmSps0bpqxBfYZQpaqZghUVFSmVij2TtaysrNrMySKoqKjIrFmz8vHHHydJWrVqVc8VCUYBAAAAVjmfNxqV5uXrJPOTBR+3TMOyJknqd3be5AajkyTl5WaxJslHH31U3yXUm2bNmmXNNdes7zL+//buPS7n+/8f+OPdVVfnk0MSKUUiOhGmUs1sPo5twxc/5rCNbTmEjzFmmvl8jOU0GzazDNvYnKYNsUklZEq1zLmEFGVKB5W6Xr8/3K7r0+W6oqKDetxvt+v20ev0fr3f+3h5X89eBwZGiYiIiIiIiIgaGyGV4pb+IZiVucCozB56wqxe+yNBBksDewBAQUEBZ4xKEszNzeu7G3VOLpfD1NQUzZs3h0wmq+/uMDBKRERERERERNQYCakUeXpnkKd3BhASJNRfIEquY4reHTYAACIiIpr8rFGZTIaePXvWdzfqlCRJ9b6n6KOaxOFL9+/fx0cffQQnJycYGBjAxsYGkyZNQkZGRrXbunv3LmbMmAE7Ozvo6+v
Dzs4OwcHByM3NffYdJyIiIiIiIiJ6FiQBIZXV46ccOjo60NFpEqGoKlE+j6byaWhBUaAJBEaLi4vx4osv4pNPPkFBQQGGDRsGW1tbhIWFwcPDA6mpqVVuKycnBz179sTnn38OXV1dBAYGwtTUFGvWrEGvXr3wzz//1OKdEBERERERERER0bPS6AOjS5YswcmTJ/HCCy/g4sWL2LFjB+Li4rBixQpkZ2dj0qRJVW4rODgYly9fxmuvvYYLFy5gx44dSElJwbRp03Dx4kXMmjWrFu+EiIiIiIiIiIiInpVGHRgtLS3FF198AQD48ssvYWJiosqbNWsWXF1dERUVhfj4+Ce2lZmZiR9//BFyuRzr1q2Dru7/tmf97LPP0LJlS2zbtg23b99+9jdCREREREREREREz1SjDozGxsYiLy8Pjo6O8PDw0MgfPnw4ACA8PPyJbR08eBAKhQK+vr5o1aqVWp6+vj6GDBmC8vJy7N+//9l0noiIiIiIiIiIiGpNow6MJiUlAQA8PT215ivTk5OT67QtIiIiIiIiIiIiql+6Ty7y/Lp27RoAoG3btlrzlenp6el12hYRETUu+rJG/c9plVR8Bro6+oCiHjvTAOjqyFV/lslk9diThoHPgIio6eH7Ed+PHsX3I3V8Bg1Dox6pCgoKAABGRkZa842NjQEA+fn5ddoWALi4uGhNP3/+PHR1ddGpU6cqtUNERNRQ7MLc+u5Cg/I++G85ERFRU8f3I3V8P6KauHr1KuRy+ZML1kCjDow+jyRJglwuVzvcieh5cOXKFQCAo6NjPfeEiJo6jkdE1FBwPCKihoLjET3P5HK5akLis9aoo2/KU+iLioq05hcWFgIATE1N67QtADh79myVyhE9L5SzoPn/bSKqbxyPiKih4HhERA0FxyMi7Rr14Uvt2rUDANy4cUNrvjLdzs6uTtsiIiIiIiIiIiKi+tWoA6Nubm4AgISEBK35ynRXV9c6bYuIiIiIiIiIiIjqV6MOjHp7e8Pc3BxXrlxBYmKiRv7OnTsBAEOGDHliWwMGDICOjg5iYmJw+/ZttbySkhKEh4dDJpNh4MCBz6TvREREREREREREVHsadWBULpdj6tSpAICgoCDVPqAAsHLlSiQnJ8PPzw/du3dXpX/xxRdwdnbGBx98oNZW69atMXr0aJSWluK9995DWVmZKu/9999HdnY2xo4dCysrq1q+KyIiIiIiIiIiInpajfrwJQD48MMP8fvvv+P48ePo2LEjfH19kZ6ejri4OLRs2RLffvutWvmcnBxcuHABmZmZGm2tXr0aJ0+exK5du+Ds7IwePXrg7NmzSElJQceOHbFy5cq6ui0iIiIiIiIiIiJ6Co16xigAGBgYIDIyEgsXLoSRkRH27t2L9PR0TJgwAQkJCXBwcKhyWy1atMCpU6cwbdo0lJaWYs+ePcjLy8P06dNx6tQpNGvWrBbvhKhhO3v2LE84JKIGgeMRETUUHI+IqKHgeESknSSEEPXdCSIiIiIiIiIiIqK61OhnjBIRERERERERERE9ioFRIiIiIiIiIiIianIYGCUiIiIiIiIiIqImh4FRIiIiIiIiIiIianIYGCUiIiIiIiIiIqImh4FRIiIiIiIiIiIianIYGCVqpM6cOQNJktCmTRut+QqFAhYWFpAkCZMnT9ZaJjo6GpIkwcXFpVb6ePToUUiShAkTJlSrnr29PSRJqpU+EdGzFRkZiddffx1t2rSBXC6HpaUlOnXqhBEjRuCLL75AXl5efXeRiJ4zkiRpfPT09GBjY4PXX38dx48fr5d++fv7Q5IkXL16tV6uT0QNQ229+1Tlu1NsbCwkSYKdnZ3WsfJxn5CQkJrdcBVwfKSGTLe+O0BEtcPNzQ1mZma4efMmUlNT4eDgoJb/119/qf5RPnbsmNY2YmJiAAC+vr6121kiapQWL16MRYsWAQA6d+6MXr16QU9PDxcuXMDu3buxc+dO9OjRA717967nnqo7evQoAgI
CMH78eGzevLm+u0NElRg/frzqz/n5+UhKSsLu3buxZ88ebNu2DWPGjKnH3hFRU1Tf7z7h4eEAgC1btiAsLEwj/7vvvgMAvP766zAxMVHLc3d3r5U+ETV0DIwSNVI6Ojro06cPDh48iGPHjmkERpVBTzc3NyQnJ+POnTto3ry51jIMjBJRdcXHxyMkJAR6enr46aefEBgYqJaflZWFbdu2wcLCol76R0TPv0d/caFQKDB//nwsW7YM06dPx4gRI6Cnp1dn/dmyZQuKiooqXa1DRI1bbb/79OzZE+fOnYO5uXmlZfbt24c2bdqgb9++8PPz08hXBkZDQ0Nhb29fo34QNTZcSk/UiCkDmtpmhB47dgx6enoIDg6GEAKxsbFq+QqFAidOnFBrh4ioqnbv3g0hBEaOHKnxxQAArK2t8e9//xvOzs513zkiapR0dHSwePFi6Orq4s6dOzh79mydXr9du3Zwdnau02AsETUctf3uY2RkBGdnZ7Ru3Vpr/pUrV3Du3DkMHjyY244RVQMDo0SNmDKgqZz5WVFMTAw8PT3Rv39/rWWSkpJw7949tGvXDu3atQMA3LlzB3PmzEHHjh1hYGCAZs2aYcCAATh06JDW60uSBHt7e5SWlmLx4sVwdnaGvr6+1heFR92/fx8LFixA+/btYWBgAEdHRyxatAilpaXVeQREVE+ys7MBAC1btqxyHeX+wUIIrFmzBl26dIGBgQHatGmD6dOnIzc3V2u9oqIifPLJJ+jatSsMDQ1hbm6Ovn37Yvv27U+8ztq1a+Hm5gYjIyO4u7tjwoQJCAgIAPBwVkVle2+lpKRg7NixcHBwgIGBAVq2bAl3d3cEBwcjMzOzyvdMRM+WXC5XzaYqKytTy1O+l2izefNmrXvsFRQUYOnSpXBzc4O5uTlMTEzg6OiIESNGICIiQq1sZXvoKa9bXl6OZcuWwcnJCfr6+rC1tcXcuXNRUlKitU9FRUVYunQpPDw8YGJiAhMTE/Tu3Vs14+tR6enpePfdd+Hk5AQjIyM0a9YMLi4umDJlCi5cuKBWlmMY0bNXk3cfACgsLMSyZcvQo0cPmJmZwdjYGM7OzggKCsLFixdV5Z60x+i+ffsAAEOGDKlR/69fv46pU6fC0dFR9V1v8ODBj923+dy5c3jzzTdhb28PfX19WFlZwdvbG6GhoRpjsNLevXvRu3dvGBsbo1mzZhg9ejRu3LhRoz4TPQtcSk/UiPXs2RP6+vq4cOECcnJy0KJFCwBAamoqbt68idGjR6NNmzaws7PTmFX66DL6jIwM9O3bF6mpqWjXrh0CAwORnZ2N33//HREREVi5ciVmzpyp0QeFQoHAwEBER0fDz88Prq6uGkv2H1VaWopXXnkFMTExsLS0xKBBg1BSUoLPPvsMZ86cgRDiWTweIqpFtra2AIBdu3bhgw8+gJWVVZXrTps2DV9//TX8/f3RrVs3REVFYe3atYiKikJMTAzMzMxUZfPz8xEQEID4+Hi0bNkSgwcPRmFhIY4cOYKYmBicOHECa9as0Xqdd955B2FhYfDz80Pnzp1RWloKHx8fZGVlISIiAo6OjvDx8VGVV+69FR8fDx8fHxQXF8PV1RXDhg1DUVERUlNTsWbNGgQGBlY6m4OIaldaWhru3LkDPT09dOjQ4anaKi8vx0svvYS4uDi0aNEC/v7+MDAwwI0bN7B//34YGxvjlVdeqXJ7Y8aMwf79++Hv749OnTohJiYGy5cvR0ZGBrZt26ZW9vbt2+jfvz+Sk5NhbW0NPz8/CCFw/PhxTJgwAadPn8batWtV5a9fvw5PT0/8888/6NixIwYOHIjy8nKkp6dj48aNeOGFF9CpUycAHMOIaktN3n0yMzPRv39/nD17FpaWlvD394e+vj5SU1OxYcMGdOzYEU5OTlW6fnh4OIyMjNCvX79q9/3EiRMYNGgQ7t69i06dOmHQoEHIzs5GREQEDh48iO+//x7/93//p1bn559/xrhx41BSUoL
OnTvj1VdfRV5eHs6ePYs5c+bgrbfe0tg2YN26dVi5ciV8fX0xcOBAxMXFYfv27YiPj0dSUhIMDQ2r3XeipyaIqFHz8fERAMTevXtVad99950AIPbs2SOEEGLMmDFCLpeLoqIiVZkRI0YIAGLDhg1CCCEGDx4sAIgxY8aIkpISVbmYmBhhZGQkZDKZOHPmjNq1AQgAokOHDuLGjRsafYuMjBQAxPjx49XSP/30UwFAeHh4iJycHFX6pUuXhI2NjapdImq4rly5IgwNDQUAYWpqKsaPHy82btwoEhISRFlZmdY6dnZ2AoAwMzMTp0+fVqXn5+eLF198UQAQM2bMUKszdepUAUAEBASIe/fuqdLPnTsnrKysBAARHh6u9TotWrQQKSkpGv2obGxSeuONNwQAERoaqpF37tw5cfPmzcoeCxE9A9reA/Lz80VMTIzo0aOHACCmT5+utZ6dnZ3WNsPCwgQAsWjRIlXakSNHBADh5eUl7t+/r1Y+Ly9PbZwSQgg/Pz8BQKSlpWntb+fOnUVmZqYqPTU1VVhYWAgA4vLly2p1Bg4cqBrziouLVelZWVmqezxw4IAq/aOPPhIAxNSpUzXuLT09Xa19jmFEtaMm7z79+vUTAMTIkSNFfn6+Wl5aWppISkpS/fy495O7d+8KXV1dMWzYsMf2UTkeVRyn8vLyROvWrYVMJhPbtm1TK//nn38KS0tLYWJiIm7fvq1Kv3jxojAwMBC6urri+++/V6ujUChERESE2tilHB+NjIzE8ePHVemFhYWiT58+AoDYtGnTY/tOVFu4lJ6okdO2nF75Z29vb9X/lpaWIi4uTlVGOYPU19cXqamp+PXXX2FiYoK1a9dCLperyvn4+OCdd95BeXk5vvzyS619WLp0abUOIli3bh0AYMWKFWqzSzt06ICFCxdWuR0iqj8ODg4IDw+Hra0t8vPz8d133+Htt9+Gp6cnWrRogffee6/S5ZpTp05F9+7dVT8rxx5JkrBp0yYUFxcDeLj0bNOmTdDR0cG6detgamqqquPs7IwPP/wQACqdMTp37ly4uLhU+96US+VeeukljbzH7f1FRM9Wxa0uTE1N4evriwsXLmDt2rVYvXr1U7ev/Lvu7e0NAwMDtTwzMzO1caoqPv/8c1hbW6t+bt++PcaOHQtA/T0tMTER+/fvh5eXF1auXAl9fX1VXqtWrfD1118DANavX6/RV23jUrt27eDo6FilshzDiGquuu8+p06dwh9//AErKyt88803GqfE29vbw9XVtUrXPnDgAMrKyjB06NBq9/vbb79FZmYmgoOD8f/+3/9Ty+vRowcWLlyIgoICtZntq1atQnFxMd566y2MGTNGrY4kSXj55ZfVxi6lmTNn4oUXXlD9bGRkhFmzZgEAoqOjq913omeBgVGiRk7bAUzHjh2Dk5OTav8bZYBUWebKlSvIzMxE8+bN0blzZ1X6gAED0KxZM41rjBs3DoD2vUwlSarWPjfXrl3DtWvXYGVlpdrnr6LRo0dXuS0iql/9+vXD5cuXsXv3brzzzjvw9PSErq4ucnNzsX79eri7u2vsewcAo0aN0kjr0qUL3NzcUFBQgDNnzgB4uBz0/v378PT01HqQgXJsio2NhUKh0MivyZcHAKpgSFBQEI4ePVrpHlpEVLvGjx+v+owaNQovvPACCgsLsXjxYhw8ePCp23d3d4eOjg7CwsKwceNG3Llzp8Zt6enpaX2vUS6RrRgsUe7dHhgYCB0dza9ryj1HT506pUpTjkvz58/Hr7/+qvoFkjYcw4hqT3XefX7//XcAD7/fVPzlbk3s27cPOjo6GDRoULXrKsec1157TWu+8vtkxTFH2fcpU6ZU61ovv/yyRpq2cZCoLjEwStTI9enTBzo6OkhISMD9+/eRnZ2N8+fPq+2b161bN5iZmakCoMoAp4+PDyRJws2bNwGg0gMLlOkZGRkaeVZWVlp/W1gZ5bXs7Oy05pubm2vsVUNEDZdcLserr76
K9evXIz4+HtnZ2Vi/fj0sLS1x+/ZtTJ06VaNOZX//lWONcpx40thkYWEBc3Nz3L9/H3fv3tXIVx4sV11z5syBv78/YmNjERAQAEtLS7z88stYs2YN8vLyatQmEVXf5s2bVZ8ff/wRx48fx+nTp1FcXIyhQ4dq/cVLdTg5OWH58uUoKirC5MmTYWVlBTc3N8yaNQvJycnVasva2hoymUwjXRkMqXgAk/LwpgULFqjNiq34KSgoQE5OjqrOhAkTMHLkSPz9998YMmQILC0t0bdvX/z3v/9FVlaW2jU5hhHVrqq++1y/fh0A1GZ010RZWRkOHjyInj17olWrVtWurxxzvL29tY43Xl5eAKA25tS0723bttVI0zYOEtUlHr5E1MiZm5vD1dUViYmJOHnypOpU54qBUR0dHfTu3RsnTpxAeXm5xsFLTyJJUqV5jy49I6KmzcLCAu+88w5sbGwwbNgwREZGoqioCEZGRrVyvdoYn8zMzHDkyBHExsYiPDwcR48exZEjR3D48GEsXboUMTEx6NixY027TERPwcPDA1OmTEFoaCjWr19f5SX12maVA8Ds2bMxcuRI7N27F4cPH0ZMTAxWrVqF1atXY9WqVZgxY0aV2tc28/NJffHx8aly0EEmk2HHjh2YN28efvnlFxw5cgRxcXGIiYnBp59+ioMHD6JPnz4AOIYR1bXK3n2elejoaOTm5tb4NHrlmDN8+HAYGxtXWk7b6pzqqs5YSFRXGBglagJ8fX2RmJiIY8eOaQ2MAg9/Q3jo0CEkJyer7S8KADY2NgCA9PR0re0rf8tYnX1EK6Pc16qya927d091D0T0/HrxxRcBPDz1OTc3Vy0wmp6ejm7dumnUUY4LyjHpSWNTXl4ecnNzYWhoCEtLy2faf0mS4OPjoxpLb9++jeDgYPz4449YsGABfvrpp2d6PSKquvbt2wMALl26pJaup6eHgoICrXWUs5+0sbW1xbRp0zBt2jSUlZVh+/btmDhxIt5//3288cYbz3x8Uc6oCgwMxOzZs6tV18PDAx4eHggJCcG9e/cQEhKCVatWITg4WG0ZLMcworr36LuP8hT7K1euPFW74eHhAGq+RVDbtm1x4cIFzJs3r8p7J9va2uLSpUu4cuUK3N3da3RdooaC4XqiJqDiPqMxMTFo1aqVxkwA5T6ju3fvxsWLF2FsbAxPT08A/wuiHjx4UGtQUrkRd1VnmD6OnZ0dbG1tcfv2bURFRWnkb9++/amvQUS1Twjx2PzLly8DeLjcrEWLFmp52r6Qnz9/HomJiTAxMVG9gHfv3h2GhoaIj4/XCIAA/xubvL29qzVDQXnAXHX23bOyskJISAgAICUlpcr1iOjZS01NBQCNg0xat26NO3fuaN0rVLlf3pPo6upi7Nix8PLyQmlpqdax52n1798fALBnz56nasfMzAxLly6FJElPHJc4hhE9veq++ygPQPvxxx8r/aVNVYSHh6N9+/bo2rVrjerXZMxR9l15GBzR84yBUaImQBmwPH78OM6cOaMKglbUq1cvyGQy1cnyvXv3hq7uw0nlDg4OGDRoEPLz8zFjxgw8ePBAVe/EiRNYv349ZDIZgoKCnkl/3333XQAPl6/9888/qvTU1FQsXrz4mVyDiGrXwoULMWfOHK2zIDIyMlSb9Q8dOlQViFRau3at6oAlACgqKsK0adMghMDEiRNhaGgIADA2NsakSZOgUCgQFBSEwsJCVZ2LFy9iyZIlAIDp06dXq+/KmaiV7U+4YcMGpKWlaaTv378fAFQzQIio7p05c0b1RX3gwIFqeX5+fgCgGhuUli9frnZIpVJkZCR+//13jWX2aWlpOHfuHCRJ0rpf3tPq1asX+vfvj9jYWAQFBeHevXsaZZKSktQOmNq6davWgOaBAwcghFAblziGEdWO6r779OzZEwEBAbh9+zYmT56s9h4DPFyV99dffz32mn///TeuXLlS42X0wMMDlKysrLB8+XJ8/fXXGmNeWVk
ZIiIi1MaY4OBgGBgYYOPGjdixY4daeSEEDh8+zD1D6bnBpfRETYC1tTU6dOig+i3lo8vogYezKtzc3JCQkABAc/bnV199BV9fX2zZsgVRUVF44YUXkJ2djaNHj6K8vBwrVqx4ZssoZs+ejd9++w2xsbHo0KEDXnzxRZSUlOCPP/5Av379IJPJcO3atWdyLSKqHQUFBVizZg1CQ0Ph5OSELl26wMDAADdu3EBcXBwePHiADh06aN3/b+zYsejVqxdefPFFmJubIzo6GllZWXBxccEnn3yiVnbp0qU4efIkDh8+DAcHB/j5+aGwsBBHjhxBcXExpk+fXu0vC/b29nB1dcXp06fRs2dPuLi4QCaTYejQoRg6dCg2bNiAd999F126dEHnzp2hq6uL8+fPIykpCQYGBvjoo4+e5tERURVNmDBB9efS0lKkp6fj5MmTUCgUGDJkCMaNG6dWfu7cudi5cydWr16No0ePwtHREX/99ReuX7+O9957D+vWrVMrn5SUhJkzZ6Jly5bo3r07mjdvjuzsbERFRaGkpATTpk1T/SLlWdu2bRsGDBiAdevW4YcffoC7uztsbGyQl5eH5ORkXL9+HTNmzMCAAQMAALt27cIbb7wBR0dHdOvWDYaGhkhLS0NcXBx0dHTUgsEcw4hqR03efbZu3Yp+/frhxx9/REREBHx8fKCvr48rV64gMTERK1as0Lq9kNK+ffsA4KkCoxYWFvjll18wZMgQTJkyBUuWLEHXrl1haWmJrKwsJCQkIDc3F3v27FHNSnVyckJYWBjeeOMNjBo1CosXL4arqyvy8vKQkpKC69ev4+7du9U6hJeo3ggiahImTpwoAAgA4tSpU1rLTJs2TVXmjz/+0MjPyckRs2fPFo6OjkIulwsLCwvx8ssvi4iICK3tARB2dnaV9ikyMlIAEOPHj9fIKywsFB988IFo166dkMvlwt7eXsyfP1+UlJQIOzs7weGLqGHLzs4WW7duFWPHjhXdunUTzZs3F7q6uqJZs2bC29tbLF++XBQUFKjVUf7dLi8vF6GhocLZ2Vno6+uL1q1bi6CgIPHPP/9ovVZBQYH4+OOPRZcuXYS+vr4wNTUVPj4+4ocfftBavipjyKVLl0RgYKBo3ry50NHREQDEokWLhBBC7Nu3T0yaNEm4uLgICwsLYWRkJJycnMRbb70lzp8/X/2HRUTVonxXqfjR0dERzZo1E/7+/mLTpk2ivLxca90TJ04If39/YWRkJMzMzMS//vUvkZiYKMLCwtT+ngvxcBz48MMPhbe3t2jdurWQy+WiTZs2ol+/fmLXrl1CoVCote3n5ycAiLS0NI3+VvY+pO26Svfv3xeff/656NOnjzA3NxdyuVzY2toKPz8/8dlnn4nr16+rykZFRYmgoCDh7u4umjdvLgwMDISDg4MYNWqU+PPPP9Xa5RhGVDtq8u4jhBD37t0TixcvFq6ursLQ0FCYmJgIZ2dnMXXqVHHp0iVVOW3fnfr06SPMzMxEaWlplfqoHDMfHaeEECIzM1O8//77wsXFRRgZGQkjIyPh6Ogohg0bJjZv3izy8/M16iQlJYmxY8eKNm3aCD09PWFlZSW8vb3FihUrxIMHD1TlKhsfhRAiLS1NABB+fn5VugeiZ00S4gkbYRARERHVAXt7e6Snpz9xjy4iIiKipi47OxvW1tYYPny4xnJ2Iqo6LqUnIiIiIiIiInqO3L17FwsXLsS//vWv+u4K0XONM0aJiIioQeCMUSIiIiIiqks8lZ6IiIiIiIiIiIiaHM4YJSIiIiIiIiIioiaHM0aJiIiIiIiIiIioyWFglIiIiIiIiIiIiJocBkaJiIiIiIiIiIioyWFglIiIiIiIiIiIiJocBkaJiIiIiIiIiIioyWFglIiIiIiIiIiIiJocBkaJiIiIiIiIiIioyWFglIiIiIiIiIiIiJocBkaJiIiIGgF/f39IkgRJkjB//vxKy/3666+QJAn29vZ117kG4OrVq5g
/fz569eoFKysryOVyWFpawsvLC7Nnz0ZycvIzu9bq1asREhKCq1evPrM2iYiIiOjZY2CUiIiIqJH5/PPPcevWrfruRoMghMCiRYvg5OSEpUuX4tSpUzAxMYG7uzusrKyQlJSElStXws3NDdOmTXsm11y9ejU+/vhjBkaJiIiIGjgGRomIiIgaEZlMhsLCQvznP/+p7640CFOmTMHixYuhUCgwd+5c3Lx5E6mpqTh16hQuXLiAnJwchIWFwdHRETExMfXdXSIiIiKqQwyMEhERETUiY8eOBQB89dVXuHbtWj33pn59//332LhxIyRJwk8//YRPP/0UrVu3VitjZmaGCRMmICUlBePHj6+nnhIRERFRfWBglIiIiKgR6dGjB1599VWUlpYiJCSkRm389ttvGDZsGKytrSGXy2FtbY3hw4cjLi5Oo+yoUaMgSRK++eYbjbyXXnoJkiShWbNmUCgUanmJiYmQJAnt27dXS8/IyMC0adPg5OQEAwMDGBkZoV27dujXrx+WLVuGBw8eVOkeFAoFPv74YwDAxIkT8dprrz22vIGBAWbOnKmWlpKSgo8//hg+Pj5o27Yt5HI5WrRogZdffhm7du3SaGPz5s2QJAnp6ekAgICAANW+r5IkYfPmzWrlS0tLsW7dOvj6+qJZs2bQ19eHg4MDgoKCcOPGjUr7mpGRgTfffBM2NjYwMDBAx44dsXDhQhQXF2PChAlar6UUFRWFV199Ve2/7WuvvVbpbFnlPfn7+6OsrAyhoaFwc3ODsbExLCwskJycDEmSYGZmhqKiokr7PGPGDEiShMmTJ1dahoiIiKiuMTBKRERE1MgsWbIEOjo62LJlCy5cuFDlegqFApMmTcLgwYOxb98+KBQKdO3aFSUlJdi1axe8vb3x7bffqtXx9/cHABw9elQtvbS0FMePHwcA3L17F0lJSWr5kZGRavUB4Nq1a/D09MQXX3yBq1evwsHBAS4uLigrK0NkZCTmzZuHwsLCKt3Ln3/+iUuXLgFAjfcODQ4ORkhICP766y+YmJjAzc0NBgYGOHz4MIYPH445c+aolW/VqhW8vb2hr68PAOjatSu8vb1Vn1atWqnK3r59G97e3ggKCsLx48dhZmYGZ2dnZGZmYt26dXB3d0d8fLxGny5evAhPT098++23yMnJQZcuXaCrq4slS5YgICAApaWlld7P0qVL4e/vj71790KhUMDNzQ3l5eXYs2cP+vbti88++6zSukIIBAYGYs6cOcjPz0eXLl1gZmYGV1dX9OzZE/n5+fj555+11i0tLcX3338PAHjzzTcrf+BEREREdU0QERER0XPPz89PABBr164VQggxbtw4AUCMGDFCrVx4eLgAIOzs7DTaCAkJEQBEx44dRVRUlCpdoVCI9evXC5lMJuRyuTh79qwq79y5cwKAsLGxUWsrKipKABBt2rQRAMSKFSvU8ocOHSoAiM2bN6vSpk+fLgCIl156SWRnZ6uVz8rKEqtWrRKFhYVVeh6hoaECgLCwsBAKhaJKdR71888/izNnzmikx8fHCycnJwFAxMbGauTb2dkJACIyMrLStgMCAgQAMWDAAJGamqpKLygoEG+//bYAINq3by9KSkpUeQqFQvTo0UMAEH369BEZGRmqvISEBGFjYyP09PQEABEWFqZ2vUOHDgkAQpIkERoaKsrLy4UQQpSVlYlPP/1UlffHH3+o1QsLCxMAhEwmEy1atBDR0dGqvKKiIiGEEBs3bhQARN++fbXe608//SQACBcXl0qfBxEREVF94IxRIiIiokYoJCQEenp62LlzJ86cOfPE8nfu3MHy5cuhr6+PX375BX379lXlSZKEd955B9OnT0dpaSlWr16tynN2doa1tTVu3ryJixcvqtKVM0Lnz5+v9jPwcGaqcul2QECAKl05u3Xq1Klo0aKFWv9atWqF4OBgGBkZVen+MzIyAAD29vaQJKlKdR41fPhwuLu7a6R7enriyy+/BABs3bq12u0
eOHAAkZGRcHZ2xu7du9W2EzA2NsaGDRvQo0cPpKWlYefOnaq8yMhInD59GkZGRti5cydsbGxUeR4eHti8eXOlWw3897//BfBw64PZs2dDR+fh1wCZTIa5c+fi9ddfhxACS5Ys0Vq/vLwc69evh6+vryrN0NBQ1aaJiQliYmJw+fJljbrKWcacLUpEREQNDQOjRERERI2Qg4MD3nzzTQghsGDBgieW379/P4qKiuDr64vOnTtrLRMYGAhAc9m8tuX0R48ehY6ODkaPHg0nJydER0ejvLwcwMP9Re/evYv27dujXbt2qjrKP+/evbvKe4lWJj8/HwBgYmLyVO1kZWVh1apVGDNmDF566SX4+PjAx8cHH3zwAQBUKej8KGWwc+zYsargYkU6OjoYMmQIAPVnevDgQQDAwIEDNQ6RAoD+/fvDzs5OI72wsFAViA4ODtbap9mzZwMAYmJitO4VamZmhldffVVrXRMTE4waNQpCCI2tFjIyMnDo0CHI5XKMGzdOa30iIiKi+qJb3x0gIiIiotqxcOFCfPfddzhw4ACOHTsGHx+fSssmJycDAM6ePVtpueLiYgDQOBjI398f27dvR2RkJCZPnozi4mKcOHECbm5usLS0REBAAL766iskJCTAy8tLFeyruL8oAEyfPh1btmzBli1bcODAAQwYMADe3t7w8/ODs7Nzte7d1NQUAFBQUFCtehXt2LEDb7755mP3Nb1z506121U+623btuHAgQNay9y6dQuA+rNWzsh1c3OrtG03NzfV4U9Kly9fVgWlu3btqrVet27dAABlZWW4fPkyXF1d1fKdnJwgk8kqve7bb7+Nb775Blu2bMEnn3yiKrt582YoFAoMGTJEYxYwERERUX1jYJSIiIiokbKxsUFQUBBCQ0OxYMECREVFVVo2NzcXAJCZmYnMzMzHtnv//n21n5XL4ZUBzxMnTqCkpESV7u/vj6+++gqRkZFqgdGKy+iBh0G72NhYhISE4PDhw9i6datqqXrXrl2xbNkyDBw4sEr33qZNGwDA1atXIYSo9nL6tLQ0vPHGGygtLUVQUBDGjx+Pjh07wtTUFDKZDKmpqXB0dKzRzFblsz5//vwTy1acvakM8iqDvtpoy6s4e7ayrQhMTExgbGyMwsJCVfmKjI2NH9vPnj17wtXVFcnJyYiIiFD9d9q8eTMALqMnIiKiholL6YmIiIgasXnz5sHMzAzR0dGIiIiotJxyyfl7770HIcQTPxU5OTnBxsYGWVlZOH/+vEbgUzkzNDIyEgqFAtHR0WrpFXXv3h3h4eG4e/cuIiMjsXjxYnTr1g0pKSkYOnQoTp48WaX7Vs56zc3NRVJSUpXqVLRjxw6UlpZi+PDh+OKLL+Dl5QULCwvVTMiazBRVUj7rn3766YnPueJSemU9bYFLJW15FWfPalsmr8xTzox9XOD1cd5++20A/9tTNDo6GpcvX0bbtm3xyiuv1KhNIiIiotrEwCgRERFRI9a8eXPMmjULAPDhhx9WWk65xDolJaVG1/Hz8wPwMPgZGRkJmUymOsDJ2toazs7OOHbsGP7880/k5eXBwcEBtra2lbZnaGgIf39/LFy4EElJSRg0aBDKy8uxcePGKvXHy8sLHTp0AACsXbu22veTlpYGAGqHUFX0uADtk2an1vRZOzk5AfjfUnxttOU5OjqqArqVXVOZrqurq3pu1aXcM3Xfvn3IyclRBUgnTJigOuyJiIiIqCHhGwoRERFRIzdr1iy0aNECp0+fxq5du7SWGTx4MAwMDBATE4M///yz2tdQzg7dv38/4uLi4OnpCTMzM7X8goICfPbZZ2rlq0KSJPTu3RsAcPPmzSrV0dHRwUcffQQACAsLw+7dux9bvri4GKtXr1b9rFxyrm1bgeLi4scGW5UHKj265YDSiBEjAACbNm1CXl7eY/tV0YABAwA8fMbKPUgr+uOPP3D16lWNdBMTE9Vp8hXvsaKVK1cCeBgIrmy5/ZNYWFjg9dd
fx4MHD7Bu3Trs3LkTkiRh4sSJNWqPiIiIqLYxMEpERETUyJmammLevHkAoNqz81GtWrXCvHnzIITA4MGDsXfvXo0l8+np6QgNDcWmTZs06iuXxf/2228oLS3VCHwq85UBSm3L6KdMmYIffvhBYzn4xYsXVXtVdu/e/bH3WtG4ceMwceJECCEwcuRIfPDBB8jKylIrU1BQgG3btsHV1VV1DeB/M0XXrVunFii+ffs2hg8fjuvXr1d6XUdHRwCodE/XIUOGICAgABkZGejfvz8SExPV8oUQSEhIwMyZM9WuHRAQAC8vLxQWFmL48OFqQdvExERMmDABenp6Wq85f/58AMD27duxevVqKBQKAIBCoUBoaCh+/vlnSJL02FnFVaFcTv/JJ5+gsLAQ/v7+cHBweKo2iYiIiGqNICIiIqLnnp+fnwAg1q5dqzX//v37ok2bNgKAACDs7Ow0yigUCjF16lRVGUtLS9GjRw/RvXt3YW1trUpftGiR1mtUbP/AgQNqebdu3VLlARDXr1/XqO/m5iYACJlMJpycnESvXr1Ex44dhSRJAoDo1q2byM3NrdZzKS8vFwsWLBC6uroCgJAkSTg6OoqePXsKZ2dnIZfLVekzZsxQ1SsrKxO+vr6qPCcnJ+Hh4SH09PSEXC4XX3/9daXP8fvvv1fdp5OTk+jbt6/w8/NTeyY5OTmq9gEIW1tb0atXL+Hm5iZMTU1V6ZGRkWptX7hwQVhZWQkAQk9PT3h4eIguXboIAKJXr15i9OjRAoDYsmWLRr/+85//qNq1srISXl5eomXLlqq0ZcuWadQJCwsTAISfn1+Vn3mnTp1UbW7btq3K9YiIiIjqGmeMEhERETUBBgYGWLhw4WPLSJKEtWvXIjo6GmPGjIGpqSn++usvpKSkwNDQECNHjsQPP/yg2rP0UcpZoLq6uqrDj5SsrKzg4uIC4OGMyrZt22rUX7VqFWbOnAkPDw/cu3cP8fHxyMrKgpeXFz799FOcPHkS5ubm1bpvHR0dLFmyBBcvXsS8efPQvXt35ObmIiEhAVlZWXB1dcWcOXOQkpKitsxcJpPhwIED+Pe//w1bW1ukpaXh5s2bGDJkCE6cOIH+/ftXes0xY8bgyy+/hIeHB27cuIHo6GhERUWpzVZt3rw5IiMjsXXrVgwYMADFxcWIj4/HlStX0K5dO7z77rs4dOiQxnN0cnJCfHw8Jk2ahObNm+Pvv/9GcXEx5s2bhyNHjuDBgwcAoLaNgdL8+fMRGRmJwMBAAMCZM2cgSRICAwMRFRWF999/v1rPtjKTJk0C8L+l9UREREQNlSTEI2ukiIiIiIjoudS1a1ecPXsWiYmJcHNzq5c+zJo1C6tWrcJ7772HL7/8sl76QERERFQVDIwSERERETUCcXFx6N27N5o1a4Zbt25BV1e3zvtQXFwMW1tb5OTkICEhAR4eHnXeByIiIqKq4lJ6IiIiIqLnxKVLl/D5558jNzdXLT02NhYjR44EAEyePLlegqIAsHz5cuTk5KBPnz4MihIREVGDxxmjRERERETPidOnT8PLywsymQxOTk4wMzNDRkYGbty4AQDo06cPfv/9dxgaGtZZnxITExEcHIxbt27h/PnzkCQJR48eRd++feusD0REREQ1wRmjRERERETPCUdHRyxYsACenp64c+cOEhIScO/ePfTu3RurV6/GkSNH6jQoCgC5ubmIiopCamoqXF1dsXPnTgZFiYiI6LnAGaNERERERERERETU5HDGKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERE
RERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk8PAKBERERERERERETU5DIwSERERERERERFRk/P/AdxyHQE450rEAAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Show the per-class metrics bar chart\n", + "from IPython.display import Image, display\n", + "display(Image('results/per_class_metrics.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "cbaa4e56", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total test examples : 50\n", + "Correct predictions : 13\n", + "Overall accuracy : 0.2600\n", + "\n", + "Sample predictions:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
true_label_idpred_label_idtrue_labelpred_labelcorrect
011SportsSportsTrue
121BusinessSportsFalse
211SportsSportsTrue
321BusinessSportsFalse
431Sci/TechSportsFalse
501WorldSportsFalse
621BusinessSportsFalse
721BusinessSportsFalse
831Sci/TechSportsFalse
921BusinessSportsFalse
\n", + "
" + ], + "text/plain": [ + " true_label_id pred_label_id true_label pred_label correct\n", + "0 1 1 Sports Sports True\n", + "1 2 1 Business Sports False\n", + "2 1 1 Sports Sports True\n", + "3 2 1 Business Sports False\n", + "4 3 1 Sci/Tech Sports False\n", + "5 0 1 World Sports False\n", + "6 2 1 Business Sports False\n", + "7 2 1 Business Sports False\n", + "8 3 1 Sci/Tech Sports False\n", + "9 2 1 Business Sports False" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Peek at the raw predictions CSV — every test example with true and predicted label\n", + "import pandas as pd\n", + "pred_df = pd.read_csv('results/predictions.csv')\n", + "print(f'Total test examples : {len(pred_df):,}')\n", + "print(f'Correct predictions : {pred_df[\"correct\"].sum():,}')\n", + "print(f'Overall accuracy : {pred_df[\"correct\"].mean():.4f}')\n", + "print()\n", + "print('Sample predictions:')\n", + "pred_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c461384a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total errors: 37\n", + "\n", + "Most common misclassification pairs:\n", + "true_label pred_label count\n", + " Business Sports 16\n", + " World Sports 13\n", + " Sci/Tech Sports 8\n" + ] + } + ], + "source": [ + "# Show which class pairs the model confuses most\n", + "errors = pred_df[~pred_df['correct']]\n", + "print(f'Total errors: {len(errors):,}')\n", + "print()\n", + "print('Most common misclassification pairs:')\n", + "confusion_pairs = (\n", + " errors.groupby(['true_label', 'pred_label'])\n", + " .size()\n", + " .reset_index(name='count')\n", + " .sort_values('count', ascending=False)\n", + " .head(5)\n", + ")\n", + "print(confusion_pairs.to_string(index=False))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}