From 1637a87a0123a394bee510176c2e9cb0ce7b5d4b Mon Sep 17 00:00:00 2001 From: Kshiti Deshpande <113427581+kshitideshpande@users.noreply.github.com> Date: Mon, 30 Mar 2026 19:48:03 -0400 Subject: [PATCH 1/5] Create .gitkeep --- .../projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitkeep | 1 + 1 file changed, 1 insertion(+) create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitkeep diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitkeep b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitkeep new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitkeep @@ -0,0 +1 @@ + From c53a20000ccbdbefa7085947091fc5d7c59050da Mon Sep 17 00:00:00 2001 From: Kshiti Deshpande Date: Wed, 1 Apr 2026 11:33:05 -0400 Subject: [PATCH 2/5] project_templete files added in branch added project_template files --- .../Dockerfile | 30 + .../Dockerfile.python_slim | 28 + .../Dockerfile.ubuntu | 40 + .../Dockerfile.uv | 49 ++ .../README.md | 802 ++++++++++++++++++ .../bashrc | 1 + .../copy_docker_files.py | 140 +++ .../docker_bash.sh | 34 + .../docker_build.sh | 40 + .../docker_build.version.log | 1 + .../docker_clean.sh | 26 + .../docker_cmd.sh | 41 + .../docker_exec.sh | 25 + .../docker_jupyter.sh | 39 + .../docker_name.sh | 12 + .../docker_push.sh | 25 + .../etc_sudoers | 31 + .../requirements.txt | 4 + .../run_jupyter.sh | 35 + .../template.API.ipynb | 215 +++++ .../template.API.py | 129 +++ .../template.example.ipynb | 198 +++++ .../template.example.py | 125 +++ .../template_utils.py | 72 ++ .../test/test_docker_all.py | 48 ++ .../utils.sh | 607 +++++++++++++ .../version.sh | 28 + 27 files changed, 2825 insertions(+) create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile create mode 100644 
class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.python_slim create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.ubuntu create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.uv create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/bashrc create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/copy_docker_files.py create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_bash.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_clean.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_cmd.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_exec.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_push.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/etc_sudoers create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt create mode 100755 
class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template_utils.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/test/test_docker_all.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/utils.sh create mode 100755 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/version.sh diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile new file mode 100644 index 000000000..f5c02c562 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile @@ -0,0 +1,30 @@ +# Use Python 3.12 slim (already has Python and pip). +FROM python:3.12-slim + +# Avoid interactive prompts during apt operations. +ENV DEBIAN_FRONTEND=noninteractive + +# Install CA certificates (needed for HTTPS). +RUN apt-get update && apt-get install -y \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install project specific packages. +RUN mkdir -p /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. 
+COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 + +CMD ["/bin/bash"] diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.python_slim b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.python_slim new file mode 100644 index 000000000..cc8f18f2f --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.python_slim @@ -0,0 +1,28 @@ +# Use Python 3.12 slim (already has Python and pip). +FROM python:3.12-slim + +# Avoid interactive prompts during apt operations. +ENV DEBIAN_FRONTEND=noninteractive + +# Install CA certificates (needed for HTTPS). +RUN apt-get update && apt-get install -y \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install project specific packages. +RUN mkdir -p /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.ubuntu b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.ubuntu new file mode 100644 index 000000000..705105d91 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.ubuntu @@ -0,0 +1,40 @@ +FROM ubuntu:24.04 +ENV DEBIAN_FRONTEND=noninteractive + +# Install system utilities and Python in a single layer. 
+RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + git \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* + +# Create virtual environment. +RUN python3 -m venv /opt/venv + +# Make the venv the default Python. +ENV PATH="/opt/venv/bin:$PATH" + +# Install project specific packages. +RUN mkdir /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.uv b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.uv new file mode 100644 index 000000000..d3b2a0abc --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/Dockerfile.uv @@ -0,0 +1,49 @@ +FROM ubuntu:24.04 +ENV DEBIAN_FRONTEND=noninteractive + +# Install system utilities and Python in a single layer. +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + git \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + libgomp1 \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Install uv for package management. +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +# Install project specific packages using uv. +COPY pyproject.toml uv.lock /app/ +WORKDIR /app +RUN uv sync +ENV PATH="/app/.venv/bin:$PATH" + +# Install Jupyter. 
+RUN uv pip install --no-cache --python /app/.venv/bin/python \ + jupyterlab jupyterlab_vim jupytext + +# Copy project files. +COPY . /app + +RUN mkdir /install + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md new file mode 100644 index 000000000..58d90e2d1 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md @@ -0,0 +1,802 @@ +# Summary +This directory contains a Docker-based development environment template with: + +- Utility scripts for Docker operations (build, run, clean, push) +- Configuration files for Dockerfile and environment setup +- Jupyter notebook templates for standardized project development +- Shell utilities and Python helpers for container-based workflows + +A guide to set up Docker-based projects using the template, customize it for +your needs, and maintain it over time. 
+ +## Description of Files +- `bashrc` + - Bash configuration file enabling `vi` mode for command-line editing + +- `copy_docker_files.py` + - Python script for copying Docker configuration files to destination + directories + +- `docker_build.version.log` + - Log file containing Python, `pip`, Jupyter, and package version information + from Docker build + +- `docker_cmd.sh` + - Shell script for executing arbitrary commands inside Docker containers with + volume mounting + +- `docker_jupyter.sh` + - Shell script for launching Jupyter Lab server inside Docker containers + +- `docker_name.sh` + - Configuration file defining Docker repository and image naming variables + +- `Dockerfile` + - Docker image build configuration with Ubuntu, Python, Jupyter, and project + dependencies + +- `etc_sudoers` + - Sudoers configuration file granting passwordless sudo access for postgres + user + +- `README.md` + - Documentation file describing directory contents, files, and executable + scripts + +- `template_utils.py` + - Python utility functions supporting tutorial notebooks with data processing + and modeling helpers + +- `template.API.ipynb` + - Jupyter notebook template for API exploration and library usage examples + +- `template.example.ipynb` + - Jupyter notebook template for project examples and demonstrations + +- `utils.sh` + - Bash utility library with reusable functions for Docker operations + - Provides centralized argument parsing (`parse_default_args`) for `-h` and + `-v` flags used by all `docker_*.sh` scripts + - Provides Jupyter configuration logic: vim keybindings, notification + settings, and Docker run option builders + - All `docker_*.sh`, `docker_jupyter.sh`, and `run_jupyter.sh` scripts across + the repo source this file from `class_project/project_template/utils.sh` + +## Workflows +- All commands should be run from inside the project directory + ```bash + > cd tutorials/FilterPy + ``` + +- To build the container for a project + ```bash + > cd $PROJECT + 
# Build the container. + > docker_build.sh + # Build without cache (pass extra args after -v). + > docker_build.sh --no-cache + # Test the container. + > docker_bash.sh ls + ``` + +- Enable verbose (trace) output with `-v` + ```bash + > docker_build.sh -v + > docker_bash.sh -v + ``` + +- Get help for any docker script + ```bash + > docker_build.sh -h + > docker_jupyter.sh -h + ``` + +- Start Jupyter + ```bash + > docker_jupyter.sh + # Go to localhost:8888 + ``` + +- Start Jupyter on a specific port with vim support + ```bash + > docker_jupyter.sh -p 8890 -u + # Go to localhost:8890 + ``` + +## How to Customize a Project Template +- Copy the template + ```bash + > cp -r class_project/project_template $TARGET + ``` + +## Description of Executables + +### `copy_docker_files.py` +- **What It Does** + - Copies Docker configuration and utility files from project_template to a + destination directory + - Preserves all file permissions and attributes during copying + - Creates destination directory if it doesn't exist + +- Copy all Docker files to a target directory: + ```bash + > ./copy_docker_files.py --dst_dir /path/to/destination + ``` + +- Copy with verbose logging: + ```bash + > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG + ``` + +### `docker_bash.sh` +- **What It Does** + - Launches an interactive bash shell inside a Docker container + - Mounts the current working directory as `/data` inside the container + - Exposes port 8888 for potential services running in the container + - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` + +- Launch bash shell in the container: + ```bash + > ./docker_bash.sh + ``` + +- Launch with verbose output (prints each command): + ```bash + > ./docker_bash.sh -v + ``` + +### `docker_build.sh` +- **What It Does** + - Builds Docker container images using Docker BuildKit + - Supports single-architecture builds (default) or multi-architecture builds + (`linux/arm64`, `linux/amd64`) + - Copies 
project files to temporary build directory and generates build logs + - Accepts `-h` (help) and `-v` (verbose/trace) flags; any extra arguments + after flags are forwarded to `docker build` + +- Build container image for current architecture: + ```bash + > ./docker_build.sh + ``` + +- Build without Docker layer cache: + ```bash + > ./docker_build.sh --no-cache + ``` + +- Build multi-architecture image (requires setting `DOCKER_BUILD_MULTI_ARCH=1` + in the script): + ```bash + > # Edit docker_build.sh to set DOCKER_BUILD_MULTI_ARCH=1 + > ./docker_build.sh + ``` + +### `docker_clean.sh` +- **What It Does** + +- Removes all Docker images matching the project's full image name +- Lists images before and after removal for verification +- Uses force removal to ensure cleanup completes + +- Remove project's Docker images: + ```bash + > ./docker_clean.sh + ``` + +### `docker_cmd.sh` +- **What It Does** + - Executes arbitrary commands inside a Docker container + - Mounts current directory as `/data` for accessing project files + - Automatically removes container after command execution completes + - Accepts `-h` (help) and `-v` (verbose/trace) flags; remaining arguments + form the command to execute + +- Run Python script inside container: + ```bash + > ./docker_cmd.sh python script.py --arg value + ``` + +- List files in the container: + ```bash + > ./docker_cmd.sh ls -la /data + ``` + +- Run tests inside container: + ```bash + > ./docker_cmd.sh pytest tests/ + ``` + +### `docker_exec.sh` +- **What It Does** + - Attaches to an already running Docker container with an interactive bash + shell + - Finds the container ID automatically based on the image name + - Useful for debugging or inspecting running containers + - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` + +- Attach to running container: + ```bash + > ./docker_exec.sh + ``` + +### `docker_jupyter.sh` +- **What It Does** + - Launches Jupyter Lab server inside a Docker container + - 
Supports custom port configuration (default 8888), vim keybindings, and + custom directory mounting + - Runs `run_jupyter.sh` script inside the container with specified options + +- Start Jupyter on default port 8888: + ```bash + > ./docker_jupyter.sh + ``` + +- Start Jupyter on custom port with vim bindings: + ```bash + > ./docker_jupyter.sh -p 8889 -u + ``` + +- Start Jupyter with external directory mounted: + ```bash + > ./docker_jupyter.sh -d /path/to/notebooks -p 8889 + ``` + +- Start Jupyter in verbose mode: + ```bash + > ./docker_jupyter.sh -v -p 8890 + ``` + +### `docker_push.sh` +- **What It Does** + - Authenticates to Docker registry using credentials from + `~/.docker/passwd.$REPO_NAME.txt` + - Pushes the project's Docker image to the remote repository + - Lists images before pushing for verification + +- Push container image to registry: + ```bash + > ./docker_push.sh + ``` + +### `run_jupyter.sh` +- **What It Does** + - Launches Jupyter Lab server with no authentication (token and password + disabled) + - Binds to all network interfaces (0.0.0.0) on port 8888 + - Allows root access for container environments + - When `JUPYTER_USE_VIM=1`, verifies that `jupyterlab_vim` is installed + before enabling vim keybindings; exits with an error if not found + +- Start Jupyter Lab server (typically called from docker_jupyter.sh): + ```bash + > ./run_jupyter.sh + ``` + +- Start with vim keybindings (requires `jupyterlab_vim` installed in the + container): + ```bash + > JUPYTER_USE_VIM=1 ./run_jupyter.sh + ``` + +### `utils.sh` +- **What It Does** + - Central Bash library sourced by all `docker_*.sh` and `run_jupyter.sh` + scripts across the repository + - Provides `parse_default_args` which adds `-h` (help) and `-v` + (verbose/`set -x`) flags to every docker script + - Provides `build_container_image`, `push_container_image`, + `remove_container_image`, `kill_container`, `exec_container` utilities + - Provides Jupyter configuration helpers: vim keybindings, 
notification + suppression, and Docker run option builders + +### `version.sh` +- **What It Does** + - Reports version information for Python3, pip3, and Jupyter + - Lists all installed Python packages with versions + - Used during Docker image builds to log environment configuration + +- Display version information: + ```bash + > ./version.sh + ``` + +- Save version information to a log file: + ```bash + > ./version.sh 2>&1 | tee version.log + ``` + +# Template Customization and Maintenance + +## Quick Start for New Projects + +### Step 1: Copy the Template +```bash +> cd class_project/project_template +> cp -r . /path/to/your/new/project +> cd /path/to/your/new/project +``` + +### Step 2: Choose a Base Image +The template includes three Dockerfile options. Choose the one that best fits +your project: + +| Option | File | Best For | +| -------------------------- | ------------------------ | ---------------------------------------------------------------- | +| **Standard** | `Dockerfile.ubuntu` | Full Ubuntu environment with system tools | +| **Lightweight** | `Dockerfile.python_slim` | Minimal Python environment; reduced image size | +| **Modern Package Manager** | `Dockerfile.uv` | Fast dependency resolution with [uv](https://docs.astral.sh/uv/) | + +**How to choose:** + +- **Use Standard** if you need system-level tools (git, curl, graphviz, etc.) 
+- **Use Python Slim** to minimize image size and build time +- **Use uv** if you want faster, more reliable dependency management + +### Step 3: Set Up Your Dockerfile +- Delete the reference files you will not use (keep the one you chose) + ```bash + > rm Dockerfile.python_slim Dockerfile.uv + ``` + +- Create your working Dockerfile + ```bash + > cp Dockerfile.ubuntu Dockerfile + ``` + +- Add your dependencies + ```bash + > printf "numpy\npandas\nscikit-learn\n" > requirements.in + > pip-compile requirements.in > requirements.txt + ``` + +### Step 4: Keep Customization Minimal +- Only modify what's necessary for your project +- Use `requirements.txt` for all Python packages (don't edit Dockerfile for + this) +- Keep `bashrc` and `etc_sudoers` as-is unless you need custom shell setup +- Keep base image and Python version unless you have specific requirements + +## Understanding the Dockerfile Flow +Each Dockerfile follows the same structure. Here are the key stages: + +### Stage 1: Base Image and System Setup +```dockerfile +FROM ubuntu:24.04 # or python:3.12-slim, depending on your requirement +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get -y update && apt-get -y upgrade +``` + +- **Purpose**: Start with a clean base image and disable interactive + installation prompts + +- **When to customize**: Only change the base image or version if your project + has specific requirements (different Ubuntu version, specific Python version, + etc.) 
+ +### Stage 2: System Utilities (Ubuntu-based Dockerfiles Only) +```dockerfile +RUN apt install -y --no-install-recommends \ + sudo \ + curl \ + systemctl \ + gnupg \ + git \ + vim +``` + +- **Purpose**: Install essential system tools for development and container + management + +- **When to customize**: Add only if needed for your project + - `postgresql-client`: for database connections + - `graphviz`: for graph visualizations + - `ffmpeg`: for media processing + +- **Best practice**: Use `--no-install-recommends` to keep the image small + +### Stage 3: Python and Build Tools (Ubuntu-based Dockerfiles Only) +```dockerfile +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* +``` + +- **Purpose**: Install Python 3, pip, and build tools needed for compiled + packages + +- **Why venv**: Creates an isolated Python environment separate from system + Python + +- **When to customize**: Rarely. Only change if you need a specific Python + version (e.g., `python3.11` instead of `python3`) + +### Stage 4: Virtual Environment Setup +```dockerfile +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" +RUN python -m pip install --upgrade pip +``` + +- **Purpose**: Create and activate an isolated virtual environment for your + project + +- **Why this matters**: Ensures reproducibility and prevents dependency + conflicts across projects + +- **When to customize**: Never. 
This is a standard best practice + +### Stage 5: Jupyter Installation +```dockerfile +RUN pip install jupyterlab jupyterlab_vim +``` + +- **Purpose**: Install JupyterLab and the Vim keybinding extension for + interactive development + - `jupyterlab`: the main IDE for running notebooks in the browser + - `jupyterlab_vim`: adds Vim-style navigation to notebook cells + +- **Why in Dockerfile, not requirements.txt**: These are infrastructure + packages (the IDE itself), not project-specific dependencies + - Do NOT add `jupyterlab`, `jupyterlab-vim`, or `ipywidgets` to + `requirements.txt`; they are already installed here + +- **When to customize**: + - **Remove** this line if your project doesn't use Jupyter + - **Add more extensions** if needed (e.g., `jupyterlab-git`, + `jupyterlab-variableinspector`) + +### Stage 6: Project Dependencies +```dockerfile +COPY requirements.txt /install/requirements.txt +RUN pip install --no-cache-dir -r /install/requirements.txt +``` + +- **Purpose**: Install your project-specific Python packages + +- **When to customize**: This is the primary place to customize. 
Define all your + dependencies in `requirements.txt` + +- **Best practice**: + - **Pin all versions**: `numpy==1.24.0` (not `numpy>=1.20.0`) + - **Use `--no-cache-dir`**: Reduces image size by skipping pip cache + - **For complex dependencies**: Use `requirements.in` with `pip-tools` or + `pip-compile` + +- **Example requirements.txt**: + ```text + numpy==1.24.0 + pandas==2.0.0 + scikit-learn==1.2.2 + tensorflow==2.13.0 + ``` + +### Stage 7: Configuration +```dockerfile +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc +``` + +- **Purpose**: Apply custom bash configuration and sudo permissions + +- **When to customize**: + - **Edit `bashrc`**: to add aliases, environment variables, or custom prompt + - **Edit `etc_sudoers`**: if additional users need passwordless sudo access + +### Stage 8: Version Logging +```dockerfile +ADD version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log +``` + +- **Purpose**: Document the exact versions of Python, pip, Jupyter, and all + installed packages + +- **What it logs**: + - Python 3 version + - Pip version + - Jupyter version + - Complete list of all installed Python packages + +- **Why it matters**: Creates a detailed record of your container's environment + for troubleshooting and reproducibility + +- **How to use**: After building, review `version.log` to verify all + dependencies installed correctly + ```bash + > docker build -t my-project . 
+ > cat version.log + ``` + +- **Extending it**: If you need to log additional tools (MongoDB, Node.js, + etc.), add them to `version.sh`: + ```bash + > echo "# mongo" + > mongod --version + ``` + +### Stage 9: Port Declaration +```dockerfile +EXPOSE 8888 +``` + +- **Purpose**: Declare that the container uses port 8888 (informational for + Docker) + +- **When to customize**: Add additional ports if your application needs them + (e.g., `EXPOSE 8888 5432 3000`) + +## Best Practices: Keep It Simple + +### The Core Principle +Only change what's necessary for your project. Everything else should inherit +from the template. + +This approach: + +- Makes Dockerfiles easier to understand and maintain +- Keeps images smaller and faster to build +- Simplifies future updates from the template +- Ensures consistency across similar projects + +### How to Do It Right +| What | Where | Example | +| :--------------------------- | :--------------------------- | :------------------------------ | +| Project Python packages | `requirements.txt` | `numpy==1.24.0` | +| Jupyter + Vim (always there) | Dockerfile Stage 5 | `jupyterlab jupyterlab_vim` | +| System tools | Dockerfile `apt-get` section | `postgresql-client` | +| Shell aliases | `bashrc` | `alias jlab="jupyter lab"` | +| Custom scripts | `scripts/` directory | Setup or initialization scripts | +| User permissions | `etc_sudoers` | Grant passwordless sudo | + +- **Do NOT add to `requirements.txt`**: `jupyterlab`, `jupyterlab-vim`, + `jupyterlab_vim`, or `ipywidgets` — these are Jupyter infrastructure packages + and are already installed in Stage 5 of the Dockerfile + +### Wrong Vs. 
Right Approach +- **Wrong**: Embed everything in the Dockerfile + ```dockerfile + RUN pip install my-package && python my_setup.py && npm install + ``` + +- **Right**: Use separate files and keep Dockerfile clean + ```dockerfile + COPY requirements.txt /install/ + RUN pip install -r /install/requirements.txt + COPY scripts/setup.sh /install/ + RUN /install/setup.sh + ``` + +## .Dockerignore Policy + +### Why It Matters +The `.dockerignore` file prevents unnecessary files from being added to the +Docker build context: + +- **Reduces build time**: Fewer files to transfer to Docker daemon +- **Reduces image size**: Only necessary files are included +- **Improves security**: Prevents leaking sensitive data + +### What to Exclude: Category Breakdown +- Python Artifacts (Always Exclude) + ```verbatim + __pycache__/ + *.pyc + *.pyo + *.pyd + ``` + - Why: Compiled bytecode generated at runtime. Regenerated in container, adds + bloat + +- Virtual Environments (Always Exclude) + ```verbatim + venv/ + .venv/ + env/ + .env/ + ``` + - Why: Local venvs aren't portable to containers. The Dockerfile creates its + own + +- Jupyter Checkpoints (Always Exclude) + ```verbatim + .ipynb_checkpoints/ + ``` + - Why: Auto-generated by Jupyter, not needed in the image + +- Git and Version Control (Always Exclude) + ```verbatim + .git/ + .gitignore + .gitattributes + ``` + - Why: Repository history not needed at runtime + +- Docker Build Scripts (Always Exclude) + ```verbatim + docker_build.sh + docker_push.sh + docker_clean.sh + docker_exec.sh + docker_cmd.sh + docker_bash.sh + docker_jupyter.sh + docker_name.sh + Dockerfile.* + ``` + - Why: Local development scripts don't run inside the container + +- Large Data Files (Recommended) + ```verbatim + data/ + *.csv + *.pkl + *.h5 + *.parquet + ``` + - Why: Don't ship large training and test data in the image. 
Mount via volume + instead + - Best practice: `bash > docker run -v /path/to/data:/data my-image ` + +- Test Files (Project-Dependent) + ```verbatim + tests/ + tutorials/ + ``` + - Why: Exclude if tests don't run in the container + - When to include: If CI and CD runs tests inside the container + +- Documentation (Recommended) + ```verbatim + README.md + docs/ + *.md + ``` + - Why: Not needed at runtime + - Exception: Only keep if your app reads these files at runtime + +- Generated Files (Always Exclude) + ```verbatim + *.log + *.tmp + *.cache + build/ + dist/ + ``` + - Why: Generated at runtime, not needed in the image + +## Workflow: From Template to Your Project + +### Complete Setup Checklist +- Copy the template + ```bash + > cp -r project_template my-new-project + > cd my-new-project + ``` + +- Keep all reference Dockerfiles + ```verbatim + Dockerfile.ubuntu_24_04 + Dockerfile.python_slim + Dockerfile.uv + ``` + +- Create your working Dockerfile + ```bash + > cp Dockerfile.ubuntu_24_04 Dockerfile + ``` + +- Add your dependencies + ```bash + > pip freeze > requirements.txt + ``` + +- Configure `.dockerignore`: Review the template `.dockerignore` and add your + project-specific exclusions (e.g., data directories) + +- Test the build + ```bash + > docker build -t my-project:latest . + > docker run -it my-project:latest bash + ``` + +- Test Jupyter (if using) + ```bash + > ./docker_jupyter.sh -p 8888 + ``` + +- Document customizations in your project README: + - Base image chosen and why + - Key dependencies + - Any Dockerfile modifications + - How to build and run + +## Maintaining Your Setup + +### Document Any Changes +- If you modify the Dockerfile, add explanatory comments: + ```dockerfile + # Custom: PostgreSQL client for database access + postgresql-client \ + + # Custom: Node.js for frontend builds + nodejs \ + ``` + +### Monitor Package Versions +- After each build, review `version.log`: + ```bash + > docker build -t my-project . 
+ > cat version.log + ``` + +### Keep `.dockerignore` Updated +- If you add new directories or files, update `.dockerignore`. Add to + `.dockerignore` if the directory shouldn't be in the image: + ```verbatim + data/ + cache/ + .temp/ + ``` + +### Contribute Improvements Back +When you improve your project's Docker setup: + +- Test thoroughly in your project +- Document the improvement clearly +- Submit back to `project_template` +- Other projects can adopt it when they update + +Example improvements: + +- Better way to install TensorFlow with GPU support +- Optimized `.dockerignore` for data science projects +- Security hardening (non-root user setup) + +## Troubleshooting + +### Build Is Slow +- Check `.dockerignore`: Ensure large directories (data/, .git/) are excluded +- Check Docker daemon: Verify Docker is running properly +- Check layer caching: Docker reuses cached layers; avoid changing early layers + +### Image Is Too Large +- Check layer sizes: + ```bash + > docker history my-project:latest + ``` + +- Remove unnecessary packages or use `python_slim` base image + +### Package Not Found Error +- Verify package name in PyPI (packages are case-sensitive) +- Check Python version compatibility +- Pin specific version if needed + +### Permission Issues in Container +- Check `etc_sudoers`: Ensure user has appropriate permissions +- Check file ownership: Ensure COPY doesn't create root-only files + +### Jupyter Won't Connect +- Run Jupyter + ```bash + > ./docker_jupyter.sh -p 8888 + ``` + +- Verify http://localhost:8888 (not https). 
Check firewall if remote access + needed + +### Vim Keybindings Not Working +- If `run_jupyter.sh` exits with `ERROR: jupyterlab_vim is not installed`, it + means `jupyterlab_vim` is missing from the container image +- Make sure `jupyterlab_vim` is installed in the Dockerfile: + ```dockerfile + RUN pip install jupyterlab jupyterlab_vim + ``` +- Rebuild the image after adding the package: + ```bash + > ./docker_build.sh + ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/bashrc b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/bashrc new file mode 100644 index 000000000..4b7ff4c49 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/bashrc @@ -0,0 +1 @@ +set -o vi diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/copy_docker_files.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/copy_docker_files.py new file mode 100644 index 000000000..0e97c194c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/copy_docker_files.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +""" +Copy Docker-related files from the source directory to a destination directory. + +This script copies all Docker configuration and utility files from +class_project/project_template/ to a specified destination directory. + +Usage examples: + # Copy all files to a target directory. + > ./copy_docker_files.py --dst_dir /path/to/destination + + # Copy with verbose logging. 
+ > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG + +Import as: + +import class_project.project_template.copy_docker_files as cpdccodo +""" + +import argparse +import logging +import os +from typing import List + +import helpers.hdbg as hdbg +import helpers.hio as hio +import helpers.hparser as hparser +import helpers.hsystem as hsystem + +_LOG = logging.getLogger(__name__) + +# ############################################################################# +# Constants +# ############################################################################# + +# List of files to copy from the source directory. +_FILES_TO_COPY = [ + "bashrc", + "docker_bash.sh", + "docker_build.sh", + "docker_clean.sh", + "docker_cmd.sh", + "docker_exec.sh", + "docker_jupyter.sh", + "docker_name.sh", + "docker_push.sh", + "etc_sudoers", + "install_jupyter_extensions.sh", + "run_jupyter.sh", + "version.sh", +] + + +# ############################################################################# +# Helper functions +# ############################################################################# + + +def _get_source_dir() -> str: + """ + Get the absolute path to the source directory containing Docker files. + + :return: absolute path to class_project/project_template/ + """ + # Get the directory where this script is located. + script_dir = os.path.dirname(os.path.abspath(__file__)) + _LOG.debug("Script directory='%s'", script_dir) + return script_dir + + +def _copy_files( + *, + src_dir: str, + dst_dir: str, + files: List[str], +) -> None: + """ + Copy specified files from source directory to destination directory. + + :param src_dir: source directory path + :param dst_dir: destination directory path + :param files: list of filenames to copy + """ + # Verify source directory exists. + hdbg.dassert_dir_exists(src_dir, "Source directory does not exist:", src_dir) + # Create destination directory if it doesn't exist.
+ hio.create_dir(dst_dir, incremental=True) + _LOG.info("Copying %d files from '%s' to '%s'", len(files), src_dir, dst_dir) + # Copy each file. + copied_count = 0 + for filename in files: + src_path = os.path.join(src_dir, filename) + dst_path = os.path.join(dst_dir, filename) + # Verify source file exists. + hdbg.dassert_path_exists( + src_path, "Source file does not exist:", src_path + ) + # Copy the file using cp -a to preserve all permissions and attributes. + _LOG.debug("Copying '%s' -> '%s'", src_path, dst_path) + cmd = f"cp -a {src_path} {dst_path}" + hsystem.system(cmd) + copied_count += 1 + # + _LOG.info("Successfully copied %d files", copied_count) + + +# ############################################################################# + + +def _parse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--dst_dir", + action="store", + required=True, + help="Destination directory where files will be copied", + ) + hparser.add_verbosity_arg(parser) + return parser + + +def _main(parser: argparse.ArgumentParser) -> None: + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + # Get source directory. + src_dir = _get_source_dir() + # Copy files to destination. + _copy_files( + src_dir=src_dir, + dst_dir=args.dst_dir, + files=_FILES_TO_COPY, + ) + + +if __name__ == "__main__": + _main(_parse()) diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_bash.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_bash.sh new file mode 100755 index 000000000..0025e81f4 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_bash.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# """ +# This script launches a Docker container with an interactive bash shell for +# development. 
+# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions from the project template. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List the available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" + +# Configure and run the Docker container with interactive bash shell. +# - Container is removed automatically on exit (--rm) +# - Interactive mode with TTY allocation (-ti) +# - Port forwarding for Jupyter or other services +# - Git root mounted to /git_root inside container +CONTAINER_NAME=${IMAGE_NAME}_bash +PORT= +DOCKER_CMD=$(get_docker_bash_command) +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME" diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.sh new file mode 100755 index 000000000..5b0957a99 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# """ +# Build a Docker container image for the project. +# +# This script sets up the build environment with error handling and command +# tracing, loads Docker configuration from docker_name.sh, and builds the +# Docker image using the build_container_image utility function. It supports +# both single-architecture and multi-architecture builds via the +# DOCKER_BUILD_MULTI_ARCH environment variable. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. 
+GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args are passed to the build. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Load Docker configuration variables (REPO_NAME, IMAGE_NAME, FULL_IMAGE_NAME). +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure Docker build settings. +# Enable BuildKit for improved build performance and features. +export DOCKER_BUILDKIT=1 +#export DOCKER_BUILDKIT=0 + +# Configure single-architecture build (set to 1 for multi-arch build). +#export DOCKER_BUILD_MULTI_ARCH=1 +export DOCKER_BUILD_MULTI_ARCH=0 + +# Build the container image. +# Pass extra arguments (e.g., --no-cache) via command line after -v. +build_container_image "$@" diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log new file mode 100644 index 000000000..8315eefe2 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log @@ -0,0 +1 @@ +the input device is not a TTY diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_clean.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_clean.sh new file mode 100755 index 000000000..7e40839ae --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_clean.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# """ +# Remove Docker container image for the project. +# +# This script cleans up Docker images by removing the container image +# matching the project configuration. Useful for freeing disk space or +# ensuring a fresh build. 
+# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Remove the container image. +remove_container_image diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_cmd.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_cmd.sh new file mode 100755 index 000000000..906d7a77b --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_cmd.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# """ +# Execute a command in a Docker container. +# +# This script runs a specified command inside a new Docker container instance. +# The container is removed automatically after the command completes. The +# git root is mounted to /git_root inside the container. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args form the command. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Capture the command to execute from remaining arguments. +CMD="$@" +echo "Executing: '$CMD'" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images matching the expected image name. 
+run "docker image ls $FULL_IMAGE_NAME" +#(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Configure and run the Docker container with the specified command. +CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_cmd_command) +PORT="" +DOCKER_RUN_OPTS="" +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT $DOCKER_RUN_OPTS) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME bash -c '$CMD'" diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_exec.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_exec.sh new file mode 100755 index 000000000..24f8e401a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_exec.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Execute a bash shell in a running Docker container. +# +# This script connects to an already running Docker container and opens an +# interactive bash session for debugging or inspection purposes. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Execute bash shell in the running container. 
+exec_container diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh new file mode 100755 index 000000000..1a60dfd3a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# """ +# Execute Jupyter Lab in a Docker container. +# +# This script launches a Docker container running Jupyter Lab with +# configurable port, directory mounting, and vim bindings. It passes +# command-line options to the run_jupyter.sh script inside the container. +# +# Usage: +# > docker_jupyter.sh [options] +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse command-line options and set Jupyter configuration variables. +parse_docker_jupyter_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images and inspect architecture. +list_and_inspect_docker_image + +# Run the Docker container with Jupyter Lab. +CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") +CONTAINER_NAME=$IMAGE_NAME +# Kill existing container if -f flag is set. 
+kill_existing_container_if_forced + +DOCKER_CMD=$(get_docker_jupyter_command) +DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh new file mode 100644 index 000000000..32a546cf3 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# """ +# Docker image naming configuration. +# +# This file defines the repository name, image name, and full image name +# variables used by all docker_*.sh scripts in the project template. +# """ + +REPO_NAME=gpsaggese +# The file should be all lower case. +IMAGE_NAME=umd_project_template +FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_push.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_push.sh new file mode 100755 index 000000000..27d752dd9 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_push.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Push Docker container image to Docker Hub or registry. +# +# This script authenticates with the Docker registry using credentials from +# ~/.docker/passwd.$REPO_NAME.txt and pushes the locally built container +# image to the remote repository. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker image naming configuration. 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source $SCRIPT_DIR/docker_name.sh + +# Push the container image to the registry. +push_container_image diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/etc_sudoers b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/etc_sudoers new file mode 100644 index 000000000..ee0816a15 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/etc_sudoers @@ -0,0 +1,31 @@ +# +# This file MUST be edited with the 'visudo' command as root. +# +# Please consider adding local content in /etc/sudoers.d/ instead of +# directly modifying this file. +# +# See the man page for details on how to write a sudoers file. +# +Defaults env_reset +Defaults mail_badpass +Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin" + +# Host alias specification + +# User alias specification + +# Cmnd alias specification + +# User privilege specification +root ALL=(ALL:ALL) ALL + +# Members of the admin group may gain root privileges +%admin ALL=(ALL) ALL + +# Allow members of group sudo to execute any command +%sudo ALL=(ALL:ALL) ALL + +# See sudoers(5) for more information on "#include" directives: +postgres ALL=(ALL) NOPASSWD:ALL + +#includedir /etc/sudoers.d diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt new file mode 100644 index 000000000..49aca3901 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt @@ -0,0 +1,4 @@ +matplotlib +numpy +pandas +seaborn diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh new file 
mode 100755 index 000000000..d725c3fe7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# """ +# Launch Jupyter Lab server. +# +# This script starts Jupyter Lab on port 8888 with the following configuration: +# - No browser auto-launch (useful for Docker containers) +# - Accessible from any IP address (0.0.0.0) +# - Root user allowed (required for Docker environments) +# - No authentication token or password (for development convenience) +# - Vim keybindings can be enabled via JUPYTER_USE_VIM environment variable +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Print each command to stdout before executing it. +#set -x + +# Import the utility functions from /git_root. +GIT_ROOT=/git_root +source $GIT_ROOT/class_project/project_template/utils.sh + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Setup Jupyter Lab environment. +setup_jupyter_environment + +# Initialize Jupyter Lab command with base configuration. +JUPYTER_ARGS=$(get_jupyter_args) + +# Start Jupyter Lab with development-friendly settings. +run "jupyter lab $JUPYTER_ARGS" diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb new file mode 100644 index 000000000..3afca937c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "183c2248-ea3d-43ba-b87e-d821bba1bbc6", + "metadata": {}, + "source": [ + "# Template API Notebook\n", + "\n", + "This is a template notebook. The first heading should be the title of what notebook is about. 
For example, if it is a neo4j tutorial the heading should be `Neo4j API`.\n", + "\n", + "- Add description of what the notebook does.\n", + "- Point to references, e.g. (neo4j.API.md)\n", + "- Add citations.\n", + "- Keep the notebook flow clear.\n", + "- Comments should be imperative and have a period at the end.\n", + "- Your code should be well commented.\n", + "\n", + "The name of this notebook should be in the following format:\n", + "- if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb`\n", + "\n", + "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "265e0d58-a7cd-4edf-a0b4-96b60220e801", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "d3b2f997-5c9b-4238-b6d5-e5f2cea43809", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d1480ee9-d6a6-437d-b927-da6cbb05bdf5", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "# Import libraries in this section.\n", + "# Avoid imports like import *, from ... import ..., from ...
import *, etc.\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import helpers.hnotebook as hnotebo" + ] + }, + { + "cell_type": "markdown", + "id": "f9208cc9-837d-4fec-a312-9c4aa5b7648d", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9a2d7a9c-c6c5-48c9-8445-11c97045d00b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0mWARNING: Running in Jupyter\n", + "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-085a2ce7-6161-4c8a-92d5-492051832f3c.json'\n" + ] + } + ], + "source": [ + "hdbg.init_logger(verbosity=logging.INFO)\n", + "\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "hnotebo.config_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "79c37ba3-bd5d-4a44-87df-645eee54977a", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Make the notebook flow clear\n", + "Each notebook needs to follow a clear and logical flow, e.g:\n", + "- Load data\n", + "- Compute stats\n", + "- Clean data\n", + "- Compute stats\n", + "- Do analysis\n", + "- Show results\n", + "\n", + "\n", + "\n", + "\n", + "#############################################################################\n", + "Template\n", + "#############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a8a109cd-fc8e-4b9e-9dc0-4fc8d4126ad8", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class Template:\n", + " \"\"\"\n", + " Brief imperative description of what the class does in one line, if needed.\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " pass\n", + "\n", + " def method1(self, arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the method does in one line.\n", + "\n", + " You can elaborate more in 
the method docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every method/function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass\n", + "\n", + "\n", + "def template_function(arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the function does in one line.\n", + "\n", + " You can elaborate more in the function docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "00926523-ae59-497d-bba8-b22e58333849", + "metadata": {}, + "source": [ + "## The flow should be highlighted using headings in markdown\n", + "```\n", + "# Level 1\n", + "## Level 2\n", + "### Level 3\n", + "```" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py
b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py new file mode 100644 index 000000000..4192ef8fe --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py @@ -0,0 +1,129 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Template API Notebook +# +# This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`. +# +# - Add description of what the notebook does. +# - Point to references, e.g. (neo4j.API.md) +# - Add citations. +# - Keep the notebook flow clear. +# - Comments should be imperative and have a period at the end. +# - Your code should be well commented. +# +# The name of this notebook should be in the following format: +# - if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb` +# +# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md + +# %% +# %load_ext autoreload +# %autoreload 2 +# %matplotlib inline + +# %% [markdown] +# ## Imports + +# %% +import logging +# Import libraries in this section. +# Avoid imports like import *, from ... import ..., from ... import *, etc.
+ +import helpers.hdbg as hdbg +import helpers.hnotebook as hnotebo + +# %% [markdown] +# ## Configuration + +# %% +hdbg.init_logger(verbosity=logging.INFO) + +_LOG = logging.getLogger(__name__) + +hnotebo.config_notebook() + + +# %% [markdown] +# ## Make the notebook flow clear +# Each notebook needs to follow a clear and logical flow, e.g: +# - Load data +# - Compute stats +# - Clean data +# - Compute stats +# - Do analysis +# - Show results +# +# +# +# + + +# ############################################################################# +# Template +# ############################################################################# + + +# %% +class Template: + """ + Brief imperative description of what the class does in one line, if needed. + """ + + def __init__(self): + pass + + def method1(self, arg1: int) -> None: + """ + Brief imperative description of what the method does in one line. + + You can elaborate more in the method docstring in this section, for e.g. explaining + the formula/algorithm. Every method/function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks. + pass + + +def template_function(arg1: int) -> None: + """ + Brief imperative description of what the function does in one line. + + You can elaborate more in the function docstring in this section, for e.g. explaining + the formula/algorithm. Every function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks.
+ pass + + +# %% [markdown] +# ## The flow should be highlighted using headings in markdown +# ``` +# # Level 1 +# ## Level 2 +# ### Level 3 +# ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb new file mode 100644 index 000000000..a2e9aedd7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "50f78f7e-2dee-45d6-9d37-7a55eeaae283", + "metadata": {}, + "source": [ + "# Template Example Notebook\n", + "\n", + "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`.\n", + "\n", + "- Add description of what the notebook does.\n", + "- Point to references, e.g. 
(neo4j.example.md)\n", + "- Add citations.\n", + "- Keep the notebook flow clear.\n", + "- Comments should be imperative and have a period at the end.\n", + "- Your code should be well commented.\n", + "\n", + "The name of this notebook should be in the following format:\n", + "- if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb`\n", + "\n", + "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6226667e-cab5-479c-be6a-6b7d6f580a97", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8020901a-4bc7-4b73-95e8-aaa462b4fc19", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "# Import libraries in this section.\n", + "# Avoid imports like import *, from ... import ..., from ...
import *, etc.\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import helpers.hnotebook as hnotebo" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ecb72b2-b21d-4fb0-ac92-e7174da390e6", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0mWARNING: Running in Jupyter\n", + "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-783e0930-1631-4d64-8bb4-f3a98bb74fcd.json'\n" + ] + } + ], + "source": [ + "hdbg.init_logger(verbosity=logging.INFO)\n", + "\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "hnotebo.config_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "1ede6422-bff2-4f0a-8d28-29a01d4786b2", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Make the notebook flow clear\n", + "Each notebook needs to follow a clear and logical flow, e.g:\n", + "- Load data\n", + "- Compute stats\n", + "- Clean data\n", + "- Compute stats\n", + "- Do analysis\n", + "- Show results\n", + "\n", + "\n", + "\n", + "\n", + "#############################################################################\n", + "Template\n", + "#############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8bbd660d-d22f-44fa-bf53-dd622dee0f53", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class Template:\n", + " \"\"\"\n", + " Brief imperative description of what the class does in one line, if needed.\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " pass\n", + "\n", + " def method1(self, arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the method does in one line.\n", + "\n", + " You can elaborate more in the method docstring in this section, for e.g. explaining\n", + " the formula/algorithm. 
Every method/function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass\n", + "\n", + "\n", + "def template_function(arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the function does in one line.\n", + "\n", + " You can elaborate more in the function docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "103f6e36-54cf-442c-b137-8091d48805a7", + "metadata": {}, + "source": [ + "## The flow should be highlighted using headings in markdown\n", + "```\n", + "# Level 1\n", + "## Level 2\n", + "### Level 3\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05d52af-67ba-4a4f-a561-af453e43854f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py new file mode 100644 index 000000000..8566ff277 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py @@ -0,0 +1,125 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Template Example Notebook +# +# This is a template notebook. The first heading should be the title of what the notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`. +# +# - Add description of what the notebook does. +# - Point to references, e.g. (neo4j.example.md) +# - Add citations. +# - Keep the notebook flow clear. +# - Comments should be imperative and have a period at the end. +# - Your code should be well commented. +# +# The name of this notebook should be in the following format: +# - if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb` +# +# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md + +# %% +# %load_ext autoreload +# %autoreload 2 +# %matplotlib inline + +# %% +import logging +# Import libraries in this section. +# Avoid imports like import *, from ... import ..., from ... import *, etc. 
class Template:
    """
    Brief imperative description of what the class does in one line, if needed.
    """

    def __init__(self) -> None:
        pass

    def method1(self, arg1: int) -> None:
        """
        Brief imperative description of what the method does in one line.

        You can elaborate more in the method docstring in this section, e.g.
        explaining the formula/algorithm. Every method/function should have a
        docstring, type hints and include the parameters and return as follows:

        :param arg1: description of arg1
        :return: description of return
        """
        # Code blocks go here.
        # Make sure to include comments to explain what the code is doing.
        # No empty lines between code blocks.
        pass
def split_data(
    df: pd.DataFrame,
    target_column: str,
    test_size: float = 0.2,
    random_state: int = 42,
) -> list:
    """
    Split the dataset into training and testing sets.

    The features are all columns of `df` except `target_column`; the target is
    the `target_column` column itself.

    :param df: full dataset
    :param target_column: name of the target column
    :param test_size: proportion of test data (default = 0.2)
    :param random_state: seed forwarded to `train_test_split` so that the
        split is reproducible (default = 42, the previously hard-coded value)
    :return: list `[X_train, X_test, y_train, y_test]`, which callers can
        unpack as `X_train, X_test, y_train, y_test = split_data(...)`
    :raises KeyError: if `target_column` is not a column of `df`
    """
    # Fail early with a message that names the missing column and lists the
    # available ones.
    if target_column not in df.columns:
        raise KeyError(
            f"Target column '{target_column}' not found in columns: "
            f"{list(df.columns)}"
        )
    logger.info("Splitting data into train and test sets")
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
run() {
    # """
    # Echo a command and then execute it.
    #
    # All arguments are joined into one string, printed prefixed by "> ", and
    # evaluated with `eval`, so any shell syntax contained in the arguments
    # (pipes, redirections, expansions) is interpreted. Pass trusted input
    # only.
    #
    # :param @: Command (and its arguments) to execute
    # :return: Exit status of the executed command
    # """
    # Keep the variable local so that a caller's own `cmd` is not clobbered.
    local cmd="$*"
    echo "> $cmd"
    eval "$cmd"
}
parse_default_args() {
    # """
    # Parse the command-line options shared by the docker scripts.
    #
    # Recognized options: -f sets FORCE=1, -v sets VERBOSE=1, -h prints the
    # help and exits with status 0; any other option prints the help and exits
    # with status 1. FORCE and VERBOSE are first reset to 0 and are left in the
    # caller's scope, as is OPTIND as advanced by getopts. enable_verbose_mode
    # is called once parsing is complete.
    #
    # :param @: command-line arguments forwarded from the calling script
    # """
    FORCE=0
    VERBOSE=0
    while getopts "fhv" flag; do
        if [[ $flag == f ]]; then
            FORCE=1
        elif [[ $flag == v ]]; then
            VERBOSE=1
        elif [[ $flag == h ]]; then
            _print_default_help
            exit 0
        else
            _print_default_help
            exit 1
        fi
    done
    enable_verbose_mode
}
get_docker_vars_script() {
    # """
    # Load Docker variables by sourcing docker_name.sh.
    #
    # docker_name.sh is looked up in the directory containing `script_path`.
    # Sets SCRIPT_DIR and DOCKER_NAME in the caller's scope. Exits the shell
    # with status 1 when docker_name.sh does not exist.
    #
    # :param script_path: Path of a script located next to docker_name.sh
    # :return: Sources docker_name.sh, which is expected to define REPO_NAME,
    #     IMAGE_NAME, and FULL_IMAGE_NAME
    # """
    local script_path=$1
    # Find the file holding the name of the container.
    SCRIPT_DIR=$(dirname "$script_path")
    DOCKER_NAME="$SCRIPT_DIR/docker_name.sh"
    # Check the file that is about to be sourced, not just its directory.
    if [[ ! -e $DOCKER_NAME ]]; then
        echo "Can't find $DOCKER_NAME"
        exit 1
    fi
    source "$DOCKER_NAME"
}
remove_container_image() {
    # """
    # Remove Docker container image(s) matching the current configuration.
    #
    # Selects the rows of `docker image ls` that contain
    # $REPO_NAME/$IMAGE_NAME (literal match) and force-removes each listed
    # repository name. When nothing matches, no removal is attempted.
    # """
    echo "# ${FUNCNAME[0]} ..."
    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
    # -F: treat the image name as a fixed string ('.' is not a wildcard).
    local matches
    matches=$(docker image ls | grep -F "$FULL_IMAGE_NAME")
    if [[ -n $matches ]]; then
        echo "$matches"
        echo "$matches" | awk '{print $1}' | xargs -n 1 -t docker image rm -f
    else
        # Avoid running `docker image rm -f` with no arguments.
        echo "No image matching '$FULL_IMAGE_NAME' found"
    fi
    docker image ls
    echo "${FUNCNAME[0]} ... done"
}
exec_container() {
    # """
    # Execute bash shell in running Docker container.
    #
    # Opens an interactive bash session in the first running container whose
    # `docker container ls` row contains $REPO_NAME/$IMAGE_NAME.
    #
    # :return: 1 if no running container matches
    # """
    echo "# ${FUNCNAME[0]} ..."
    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
    docker container ls
    #
    # Only running containers are considered (no -a) and only the first match
    # is kept, because `docker exec` takes exactly one container.
    CONTAINER_ID=$(docker container ls | grep -F "$FULL_IMAGE_NAME" | awk '{print $1}' | head -n 1)
    echo "CONTAINER_ID=$CONTAINER_ID"
    if [[ -z $CONTAINER_ID ]]; then
        echo "No running container found for image '$FULL_IMAGE_NAME'" >&2
        return 1
    fi
    docker exec -it "$CONTAINER_ID" bash
    echo "${FUNCNAME[0]} ... done"
}
get_docker_bash_command() {
    # """
    # Print the base docker run command for a bash shell.
    #
    # A TTY is requested (-ti) only when stdin is a terminal; otherwise only
    # stdin is kept open (-i).
    #
    # :return: "docker run --rm -ti" or "docker run --rm -i"
    # """
    local tty_flags="-i"
    if [ -t 0 ]; then
        tty_flags="-ti"
    fi
    echo "docker run --rm $tty_flags"
}
configure_jupyter_vim_keybindings() {
    # """
    # Configure JupyterLab vim keybindings based on JUPYTER_USE_VIM env var.
    #
    # Reads JUPYTER_USE_VIM; if 1, verifies jupyterlab_vim is installed (exits
    # with status 1 otherwise) and writes enabled settings; otherwise writes
    # disabled settings.
    # """
    mkdir -p ~/.jupyter/lab/user-settings/@axlair/jupyterlab_vim
    if [[ $JUPYTER_USE_VIM == 1 ]]; then
        # Check that jupyterlab_vim is installed before trying to enable it.
        if ! pip show jupyterlab_vim > /dev/null 2>&1; then
            echo "ERROR: jupyterlab_vim is not installed but vim bindings were requested."
            echo "Install it with: pip install jupyterlab_vim"
            exit 1
        fi
        echo "Enabling vim."
        # Quoted delimiter: the settings are written verbatim.
        cat <<'EOF' > ~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings
{
    "enabled": true,
    "enabledInEditors": true,
    "extraKeybindings": [],
    "autosaveInterval": 6
}
EOF
    else
        echo "Disabling vim."
        cat <<'EOF' > ~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings
{
    "enabled": false,
    "enabledInEditors": false,
    "extraKeybindings": [],
    "autosaveInterval": 6
}
EOF
    fi
}


configure_jupyter_notifications() {
    # """
    # Disable JupyterLab news fetching and update checks.
    # """
    mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension
    # Quoted delimiter: the text below contains backticks, which an unquoted
    # here-document would execute as a command substitution.
    cat <<'EOF' > ~/.jupyter/lab/user-settings/\@jupyterlab/apputils-extension/notification.jupyterlab-settings
{
    // Notifications
    // @jupyterlab/apputils-extension:notification
    // Notifications settings.

    // Fetch official Jupyter news
    // Whether to fetch news from the Jupyter news feed. If Always (`true`), it will make a request to a website.
    "fetchNews": "false",
    "checkForUpdates": false
}
EOF
}


configure_jupyter_autosave() {
    # """
    # Configure JupyterLab global autosave interval to 6 seconds.
    # """
    mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/docmanager-extension
    cat <<'EOF' > ~/.jupyter/lab/user-settings/\@jupyterlab/docmanager-extension/plugin.jupyterlab-settings
{
    "autosaveInterval": 6
}
EOF
}
get_run_jupyter_cmd() {
    # """
    # Return the command to run run_jupyter.sh inside a container.
    #
    # Computes the script's directory relative to GIT_ROOT and builds the
    # corresponding /git_root/... path used inside the container.
    #
    # :param script_path: path of the calling script (pass ${BASH_SOURCE[0]})
    # :param cmd_opts: options to forward to run_jupyter.sh
    # :return: full command string to run run_jupyter.sh; status 1 (and no
    #     output on stdout) if the script directory cannot be resolved or is
    #     not located under GIT_ROOT
    # """
    local script_path=$1
    local cmd_opts=$2
    local script_dir
    script_dir=$(cd "$(dirname "$script_path")" && pwd) || return 1
    # Ignore a trailing slash in GIT_ROOT when comparing paths.
    local git_root=${GIT_ROOT%/}
    local rel_dir
    if [[ $script_dir == "$git_root" ]]; then
        # The script sits directly in the git root.
        rel_dir=""
    elif [[ $script_dir == "$git_root"/* ]]; then
        # Quote the prefix so that it is removed literally, not as a pattern.
        rel_dir="${script_dir#"$git_root"/}/"
    else
        echo "ERROR: '$script_dir' is not under GIT_ROOT='$GIT_ROOT'" >&2
        return 1
    fi
    echo "/git_root/${rel_dir}run_jupyter.sh $cmd_opts"
}
+ # """ + if [[ $FORCE == 1 ]]; then + kill_container_by_name $CONTAINER_NAME + fi +} diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/version.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/version.sh new file mode 100755 index 000000000..c46ed254c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/version.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# """ +# Display versions of installed tools and packages. +# +# This script prints version information for Python, pip, Jupyter, and all +# installed Python packages. Used for debugging and documentation purposes +# to verify the Docker container environment setup. +# """ + +# Display Python 3 version. +echo "# Python3" +python3 --version + +# Display pip version. +echo "# pip3" +pip3 --version + +# Display Jupyter version. +echo "# jupyter" +jupyter --version + +# List all installed Python packages and their versions. +echo "# Python packages" +pip3 list + +# Template for adding additional tool versions. 
+# echo "# mongo" +# mongod --version From d8f2d5b423e1432ff4aa67336904b656b42287c2 Mon Sep 17 00:00:00 2001 From: Kshiti Deshpande Date: Thu, 7 May 2026 09:36:19 -0400 Subject: [PATCH 3/5] Adding DocsGPT files Adding final DocsGPT files --- .../.env.example | 1 + .../.gitignore | 7 + .../README.md | 822 +---------- .../docker_build.version.log | 179 ++- .../docker_jupyter.sh | 5 +- .../docker_name.sh | 6 +- .../docsgpt.API.ipynb | 1275 +++++++++++++++++ .../docsgpt.API.py | 580 ++++++++ .../docsgpt.example.ipynb | 1259 ++++++++++++++++ .../docsgpt.example.py | 555 +++++++ .../docsgpt_utils.py | 1016 +++++++++++++ .../project_template_README.md | 802 +++++++++++ .../requirements.txt | 50 +- .../run_jupyter.sh | 4 + .../template.API.ipynb | 215 --- .../template.API.py | 129 -- .../template.example.ipynb | 198 --- .../template.example.py | 125 -- .../template_utils.py | 72 - .../Spring2026/projects/docker_build.log | 57 + 20 files changed, 5819 insertions(+), 1538 deletions(-) create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.env.example create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitignore create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.ipynb create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.ipynb create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt_utils.py create mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/project_template_README.md delete mode 100644 
class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb delete mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py delete mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb delete mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py delete mode 100644 class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template_utils.py create mode 100644 class_project/data605/Spring2026/projects/docker_build.log diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.env.example b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.env.example new file mode 100644 index 000000000..85ec0d413 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.env.example @@ -0,0 +1 @@ +DOCSGPT_API_KEY='add-agent-key' \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitignore b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitignore new file mode 100644 index 000000000..264017868 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/.gitignore @@ -0,0 +1,7 @@ +.env +!.env.example +.gradio/ +__pycache__/ +.ipynb_checkpoints/ +.DS_Store +.venv \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md index 58d90e2d1..11e38d349 100644 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md @@ 
-1,802 +1,46 @@ -# Summary -This directory contains a Docker-based development environment template with: +# DocsGPT Tutorial — Intelligent Documentation Assistant -- Utility scripts for Docker operations (build, run, clean, push) -- Configuration files for Dockerfile and environment setup -- Jupyter notebook templates for standardized project development -- Shell utilities and Python helpers for container-based workflows +DocsGPT is an open-source, RAG-based AI platform (15k+ GitHub stars) that +retrieves relevant document chunks and passes them to a language model to +generate grounded answers. This tutorial builds a complete documentation +assistant on top of the DocsGPT Cloud API — covering summarisation, FAQ +generation, evaluation, multi-language output, streaming, and an interactive UI. -A guide to set up Docker-based projects using the template, customize it for -your needs, and maintain it over time. +## Quick Start -## Description of Files -- `bashrc` - - Bash configuration file enabling `vi` mode for command-line editing - -- `copy_docker_files.py` - - Python script for copying Docker configuration files to destination - directories - -- `docker_build.version.log` - - Log file containing Python, `pip`, Jupyter, and package version information - from Docker build - -- `docker_cmd.sh` - - Shell script for executing arbitrary commands inside Docker containers with - volume mounting - -- `docker_jupyter.sh` - - Shell script for launching Jupyter Lab server inside Docker containers - -- `docker_name.sh` - - Configuration file defining Docker repository and image naming variables - -- `Dockerfile` - - Docker image build configuration with Ubuntu, Python, Jupyter, and project - dependencies - -- `etc_sudoers` - - Sudoers configuration file granting passwordless sudo access for postgres - user - -- `README.md` - - Documentation file describing directory contents, files, and executable - scripts - -- `template_utils.py` - - Python utility functions supporting 
tutorial notebooks with data processing - and modeling helpers - -- `template.API.ipynb` - - Jupyter notebook template for API exploration and library usage examples - -- `template.example.ipynb` - - Jupyter notebook template for project examples and demonstrations - -- `utils.sh` - - Bash utility library with reusable functions for Docker operations - - Provides centralized argument parsing (`parse_default_args`) for `-h` and - `-v` flags used by all `docker_*.sh` scripts - - Provides Jupyter configuration logic: vim keybindings, notification - settings, and Docker run option builders - - All `docker_*.sh`, `docker_jupyter.sh`, and `run_jupyter.sh` scripts across - the repo source this file from `class_project/project_template/utils.sh` - -## Workflows -- All commands should be run from inside the project directory - ```bash - > cd tutorials/FilterPy - ``` - -- To build the container for a project - ```bash - > cd $PROJECT - # Build the container. - > docker_build.sh - # Build without cache (pass extra args after -v). - > docker_build.sh --no-cache - # Test the container. 
- > docker_bash.sh ls - ``` - -- Enable verbose (trace) output with `-v` - ```bash - > docker_build.sh -v - > docker_bash.sh -v - ``` - -- Get help for any docker script - ```bash - > docker_build.sh -h - > docker_jupyter.sh -h - ``` - -- Start Jupyter - ```bash - > docker_jupyter.sh - # Go to localhost:8888 - ``` - -- Start Jupyter on a specific port with vim support - ```bash - > docker_jupyter.sh -p 8890 -u - # Go to localhost:8890 - ``` - -## How to Customize a Project Template -- Copy the template - ```bash - > cp -r class_project/project_template $TARGET - ``` - -## Description of Executables - -### `copy_docker_files.py` -- **What It Does** - - Copies Docker configuration and utility files from project_template to a - destination directory - - Preserves all file permissions and attributes during copying - - Creates destination directory if it doesn't exist - -- Copy all Docker files to a target directory: - ```bash - > ./copy_docker_files.py --dst_dir /path/to/destination - ``` - -- Copy with verbose logging: - ```bash - > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG - ``` - -### `docker_bash.sh` -- **What It Does** - - Launches an interactive bash shell inside a Docker container - - Mounts the current working directory as `/data` inside the container - - Exposes port 8888 for potential services running in the container - - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` - -- Launch bash shell in the container: - ```bash - > ./docker_bash.sh - ``` - -- Launch with verbose output (prints each command): - ```bash - > ./docker_bash.sh -v - ``` - -### `docker_build.sh` -- **What It Does** - - Builds Docker container images using Docker BuildKit - - Supports single-architecture builds (default) or multi-architecture builds - (`linux/arm64`, `linux/amd64`) - - Copies project files to temporary build directory and generates build logs - - Accepts `-h` (help) and `-v` (verbose/trace) flags; any extra arguments - after 
flags are forwarded to `docker build` - -- Build container image for current architecture: - ```bash - > ./docker_build.sh - ``` - -- Build without Docker layer cache: - ```bash - > ./docker_build.sh --no-cache - ``` - -- Build multi-architecture image (requires setting `DOCKER_BUILD_MULTI_ARCH=1` - in the script): - ```bash - > # Edit docker_build.sh to set DOCKER_BUILD_MULTI_ARCH=1 - > ./docker_build.sh - ``` - -### `docker_clean.sh` -- **What It Does** - -- Removes all Docker images matching the project's full image name -- Lists images before and after removal for verification -- Uses force removal to ensure cleanup completes - -- Remove project's Docker images: - ```bash - > ./docker_clean.sh - ``` - -### `docker_cmd.sh` -- **What It Does** - - Executes arbitrary commands inside a Docker container - - Mounts current directory as `/data` for accessing project files - - Automatically removes container after command execution completes - - Accepts `-h` (help) and `-v` (verbose/trace) flags; remaining arguments - form the command to execute - -- Run Python script inside container: - ```bash - > ./docker_cmd.sh python script.py --arg value - ``` - -- List files in the container: - ```bash - > ./docker_cmd.sh ls -la /data - ``` - -- Run tests inside container: - ```bash - > ./docker_cmd.sh pytest tests/ - ``` - -### `docker_exec.sh` -- **What It Does** - - Attaches to an already running Docker container with an interactive bash - shell - - Finds the container ID automatically based on the image name - - Useful for debugging or inspecting running containers - - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` - -- Attach to running container: - ```bash - > ./docker_exec.sh - ``` - -### `docker_jupyter.sh` -- **What It Does** - - Launches Jupyter Lab server inside a Docker container - - Supports custom port configuration (default 8888), vim keybindings, and - custom directory mounting - - Runs `run_jupyter.sh` script inside the container 
with specified options - -- Start Jupyter on default port 8888: - ```bash - > ./docker_jupyter.sh - ``` - -- Start Jupyter on custom port with vim bindings: - ```bash - > ./docker_jupyter.sh -p 8889 -u - ``` - -- Start Jupyter with external directory mounted: - ```bash - > ./docker_jupyter.sh -d /path/to/notebooks -p 8889 - ``` - -- Start Jupyter in verbose mode: - ```bash - > ./docker_jupyter.sh -v -p 8890 - ``` - -### `docker_push.sh` -- **What It Does** - - Authenticates to Docker registry using credentials from - `~/.docker/passwd.$REPO_NAME.txt` - - Pushes the project's Docker image to the remote repository - - Lists images before pushing for verification - -- Push container image to registry: - ```bash - > ./docker_push.sh - ``` - -### `run_jupyter.sh` -- **What It Does** - - Launches Jupyter Lab server with no authentication (token and password - disabled) - - Binds to all network interfaces (0.0.0.0) on port 8888 - - Allows root access for container environments - - When `JUPYTER_USE_VIM=1`, verifies that `jupyterlab_vim` is installed - before enabling vim keybindings; exits with an error if not found - -- Start Jupyter Lab server (typically called from docker_jupyter.sh): - ```bash - > ./run_jupyter.sh - ``` - -- Start with vim keybindings (requires `jupyterlab_vim` installed in the - container): - ```bash - > JUPYTER_USE_VIM=1 ./run_jupyter.sh - ``` - -### `utils.sh` -- **What It Does** - - Central Bash library sourced by all `docker_*.sh` and `run_jupyter.sh` - scripts across the repository - - Provides `parse_default_args` which adds `-h` (help) and `-v` - (verbose/`set -x`) flags to every docker script - - Provides `build_container_image`, `push_container_image`, - `remove_container_image`, `kill_container`, `exec_container` utilities - - Provides Jupyter configuration helpers: vim keybindings, notification - suppression, and Docker run option builders - -### `version.sh` -- **What It Does** - - Reports version information for Python3, pip3, and 
Jupyter - - Lists all installed Python packages with versions - - Used during Docker image builds to log environment configuration - -- Display version information: - ```bash - > ./version.sh - ``` - -- Save version information to a log file: - ```bash - > ./version.sh 2>&1 | tee version.log - ``` - -# Template Customization and Maintenance - -## Quick Start for New Projects - -### Step 1: Copy the Template ```bash -> cd class_project/project_template -> cp -r . /path/to/your/new/project -> cd /path/to/your/new/project +cd tutorials/docsgpt +cp .env.example .env # add your agent key to .env +./docker_build.sh # build the Docker image +./docker_jupyter.sh # launch Jupyter Lab at localhost:8888 ``` -### Step 2: Choose a Base Image -The template includes three Dockerfile options. Choose the one that best fits -your project: - -| Option | File | Best For | -| -------------------------- | ------------------------ | ---------------------------------------------------------------- | -| **Standard** | `Dockerfile.ubuntu` | Full Ubuntu environment with system tools | -| **Lightweight** | `Dockerfile.python_slim` | Minimal Python environment; reduced image size | -| **Modern Package Manager** | `Dockerfile.uv` | Fast dependency resolution with [uv](https://docs.astral.sh/uv/) | - -**How to choose:** - -- **Use Standard** if you need system-level tools (git, curl, graphviz, etc.) 
-- **Use Python Slim** to minimize image size and build time -- **Use uv** if you want faster, more reliable dependency management - -### Step 3: Set Up Your Dockerfile -- Delete unused reference files - ```bash - > rm Dockerfile.ubuntu Dockerfile.python_slim Dockerfile.uv - ``` - -- Create your working Dockerfile - ```bash - > cp Dockerfile.ubuntu Dockerfile - ``` - -- Add your dependencies - ```bash - > echo "numpy\npandas\nscikit-learn" > requirements.in - > pip-compile requirements.in > requirements.txt - ``` - -### Step 4: Keep Customization Minimal -- Only modify what's necessary for your project -- Use `requirements.txt` for all Python packages (don't edit Dockerfile for - this) -- Keep `bashrc` and `etc_sudoers` as-is unless you need custom shell setup -- Keep base image and Python version unless you have specific requirements - -## Understanding the Dockerfile Flow -Each Dockerfile follows the same structure. Here are the key stages: - -### Stage 1: Base Image and System Setup -```dockerfile -FROM ubuntu:24.04 # or python:3.12-slim, depending on your requirement -ENV DEBIAN_FRONTEND noninteractive -RUN apt-get -y update && apt-get -y upgrade -``` - -- **Purpose**: Start with a clean base image and disable interactive - installation prompts - -- **When to customize**: Only change the base image or version if your project - has specific requirements (different Ubuntu version, specific Python version, - etc.) 
- -### Stage 2: System Utilities (Ubuntu-based Dockerfiles Only) -```dockerfile -RUN apt install -y --no-install-recommends \ - sudo \ - curl \ - systemctl \ - gnupg \ - git \ - vim -``` - -- **Purpose**: Install essential system tools for development and container - management - -- **When to customize**: Add only if needed for your project - - `postgresql-client`: for database connections - - `graphviz`: for graph visualizations - - `ffmpeg`: for media processing - -- **Best practice**: Use `--no-install-recommends` to keep the image small - -### Stage 3: Python and Build Tools (Ubuntu-based Dockerfiles Only) -```dockerfile -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - python3 \ - python3-pip \ - python3-dev \ - python3-venv \ - && rm -rf /var/lib/apt/lists/* -``` - -- **Purpose**: Install Python 3, pip, and build tools needed for compiled - packages - -- **Why venv**: Creates an isolated Python environment separate from system - Python - -- **When to customize**: Rarely. Only change if you need a specific Python - version (e.g., `python3.11` instead of `python3`) - -### Stage 4: Virtual Environment Setup -```dockerfile -RUN python3 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" -RUN python -m pip install --upgrade pip -``` - -- **Purpose**: Create and activate an isolated virtual environment for your - project - -- **Why this matters**: Ensures reproducibility and prevents dependency - conflicts across projects - -- **When to customize**: Never. 
This is a standard best practice - -### Stage 5: Jupyter Installation -```dockerfile -RUN pip install jupyterlab jupyterlab_vim -``` - -- **Purpose**: Install JupyterLab and the Vim keybinding extension for - interactive development - - `jupyterlab`: the main IDE for running notebooks in the browser - - `jupyterlab_vim`: adds Vim-style navigation to notebook cells - -- **Why in Dockerfile, not requirements.txt**: These are infrastructure - packages (the IDE itself), not project-specific dependencies - - Do NOT add `jupyterlab`, `jupyterlab-vim`, or `ipywidgets` to - `requirements.txt`; they are already installed here - -- **When to customize**: - - **Remove** this line if your project doesn't use Jupyter - - **Add more extensions** if needed (e.g., `jupyterlab-git`, - `jupyterlab-variableinspector`) - -### Stage 6: Project Dependencies -```dockerfile -COPY requirements.txt /install/requirements.txt -RUN pip install --no-cache-dir -r /install/requirements.txt -``` - -- **Purpose**: Install your project-specific Python packages - -- **When to customize**: This is the primary place to customize. 
Define all your - dependencies in `requirements.txt` - -- **Best practice**: - - **Pin all versions**: `numpy==1.24.0` (not `numpy>=1.20.0`) - - **Use `--no-cache-dir`**: Reduces image size by skipping pip cache - - **For complex dependencies**: Use `requirements.in` with `pip-tools` or - `pip-compile` - -- **Example requirements.txt**: - ```text - numpy==1.24.0 - pandas==2.0.0 - scikit-learn==1.2.2 - tensorflow==2.13.0 - ``` - -### Stage 7: Configuration -```dockerfile -COPY etc_sudoers /etc/sudoers -COPY bashrc /root/.bashrc -``` - -- **Purpose**: Apply custom bash configuration and sudo permissions - -- **When to customize**: - - **Edit `bashrc`**: to add aliases, environment variables, or custom prompt - - **Edit `etc_sudoers`**: if additional users need passwordless sudo access - -### Stage 8: Version Logging -```dockerfile -ADD version.sh /install/ -RUN /install/version.sh 2>&1 | tee version.log -``` - -- **Purpose**: Document the exact versions of Python, pip, Jupyter, and all - installed packages - -- **What it logs**: - - Python 3 version - - Pip version - - Jupyter version - - Complete list of all installed Python packages - -- **Why it matters**: Creates a detailed record of your container's environment - for troubleshooting and reproducibility - -- **How to use**: After building, review `version.log` to verify all - dependencies installed correctly - ```bash - > docker build -t my-project . 
- > cat version.log - ``` - -- **Extending it**: If you need to log additional tools (MongoDB, Node.js, - etc.), add them to `version.sh`: - ```bash - > echo "# mongo" - > mongod --version - ``` - -### Stage 9: Port Declaration -```dockerfile -EXPOSE 8888 -``` - -- **Purpose**: Declare that the container uses port 8888 (informational for - Docker) - -- **When to customize**: Add additional ports if your application needs them - (e.g., `EXPOSE 8888 5432 3000`) - -## Best Practices: Keep It Simple - -### The Core Principle -Only change what's necessary for your project. Everything else should inherit -from the template. - -This approach: - -- Makes Dockerfiles easier to understand and maintain -- Keeps images smaller and faster to build -- Simplifies future updates from the template -- Ensures consistency across similar projects - -### How to Do It Right -| What | Where | Example | -| :--------------------------- | :--------------------------- | :------------------------------ | -| Project Python packages | `requirements.txt` | `numpy==1.24.0` | -| Jupyter + Vim (always there) | Dockerfile Stage 5 | `jupyterlab jupyterlab_vim` | -| System tools | Dockerfile `apt-get` section | `postgresql-client` | -| Shell aliases | `bashrc` | `alias jlab="jupyter lab"` | -| Custom scripts | `scripts/` directory | Setup or initialization scripts | -| User permissions | `etc_sudoers` | Grant passwordless sudo | - -- **Do NOT add to `requirements.txt`**: `jupyterlab`, `jupyterlab-vim`, - `jupyterlab_vim`, or `ipywidgets` — these are Jupyter infrastructure packages - and are already installed in Stage 5 of the Dockerfile - -### Wrong Vs. 
Right Approach -- **Wrong**: Embed everything in the Dockerfile - ```dockerfile - RUN pip install my-package && python my_setup.py && npm install - ``` - -- **Right**: Use separate files and keep Dockerfile clean - ```dockerfile - COPY requirements.txt /install/ - RUN pip install -r /install/requirements.txt - COPY scripts/setup.sh /install/ - RUN /install/setup.sh - ``` - -## .Dockerignore Policy - -### Why It Matters -The `.dockerignore` file prevents unnecessary files from being added to the -Docker build context: - -- **Reduces build time**: Fewer files to transfer to Docker daemon -- **Reduces image size**: Only necessary files are included -- **Improves security**: Prevents leaking sensitive data - -### What to Exclude: Category Breakdown -- Python Artifacts (Always Exclude) - ```verbatim - __pycache__/ - *.pyc - *.pyo - *.pyd - ``` - - Why: Compiled bytecode generated at runtime. Regenerated in container, adds - bloat - -- Virtual Environments (Always Exclude) - ```verbatim - venv/ - .venv/ - env/ - .env/ - ``` - - Why: Local venvs aren't portable to containers. The Dockerfile creates its - own - -- Jupyter Checkpoints (Always Exclude) - ```verbatim - .ipynb_checkpoints/ - ``` - - Why: Auto-generated by Jupyter, not needed in the image - -- Git and Version Control (Always Exclude) - ```verbatim - .git/ - .gitignore - .gitattributes - ``` - - Why: Repository history not needed at runtime - -- Docker Build Scripts (Always Exclude) - ```verbatim - docker_build.sh - docker_push.sh - docker_clean.sh - docker_exec.sh - docker_cmd.sh - docker_bash.sh - docker_jupyter.sh - docker_name.sh - Dockerfile.* - ``` - - Why: Local development scripts don't run inside the container - -- Large Data Files (Recommended) - ```verbatim - data/ - *.csv - *.pkl - *.h5 - *.parquet - ``` - - Why: Don't ship large training and test data in the image. 
Mount via volume - instead - - Best practice: `bash > docker run -v /path/to/data:/data my-image ` - -- Test Files (Project-Dependent) - ```verbatim - tests/ - tutorials/ - ``` - - Why: Exclude if tests don't run in the container - - When to include: If CI and CD runs tests inside the container - -- Documentation (Recommended) - ```verbatim - README.md - docs/ - *.md - ``` - - Why: Not needed at runtime - - Exception: Only keep if your app reads these files at runtime - -- Generated Files (Always Exclude) - ```verbatim - *.log - *.tmp - *.cache - build/ - dist/ - ``` - - Why: Generated at runtime, not needed in the image - -## Workflow: From Template to Your Project - -### Complete Setup Checklist -- Copy the template - ```bash - > cp -r project_template my-new-project - > cd my-new-project - ``` - -- Keep all reference Dockerfiles - ```verbatim - Dockerfile.ubuntu_24_04 - Dockerfile.python_slim - Dockerfile.uv - ``` - -- Create your working Dockerfile - ```bash - > cp Dockerfile.ubuntu_24_04 Dockerfile - ``` - -- Add your dependencies - ```bash - > pip freeze > requirements.txt - ``` - -- Configure `.dockerignore`: Review the template `.dockerignore` and add your - project-specific exclusions (e.g., data directories) - -- Test the build - ```bash - > docker build -t my-project:latest . - > docker run -it my-project:latest bash - ``` - -- Test Jupyter (if using) - ```bash - > ./docker_jupyter.sh -p 8888 - ``` - -- Document customizations in your project README: - - Base image chosen and why - - Key dependencies - - Any Dockerfile modifications - - How to build and run - -## Maintaining Your Setup - -### Document Any Changes -- If you modify the Dockerfile, add explanatory comments: - ```dockerfile - # Custom: PostgreSQL client for database access - postgresql-client \ - - # Custom: Node.js for frontend builds - nodejs \ - ``` - -### Monitor Package Versions -- After each build, review `version.log`: - ```bash - > docker build -t my-project . 
- > cat version.log - ``` - -### Keep `.dockerignore` Updated -- If you add new directories or files, update `.dockerignore`. Add to - `.dockerignore` if the directory shouldn't be in the image: - ```verbatim - data/ - cache/ - .temp/ - ``` - -### Contribute Improvements Back -When you improve your project's Docker setup: - -- Test thoroughly in your project -- Document the improvement clearly -- Submit back to `project_template` -- Other projects can adopt it when they update - -Example improvements: - -- Better way to install TensorFlow with GPU support -- Optimized `.dockerignore` for data science projects -- Security hardening (non-root user setup) - -## Troubleshooting - -### Build Is Slow -- Check `.dockerignore`: Ensure large directories (data/, .git/) are excluded -- Check Docker daemon: Verify Docker is running properly -- Check layer caching: Docker reuses cached layers; avoid changing early layers +Get your Agent key: https://app.docsgpt.cloud → Settings → Agents → Create New -### Image Is Too Large -- Check layer sizes: - ```bash - > docker history my-project:latest - ``` +Open **http://localhost:8888** and work through the notebooks in order: -- Remove unnecessary packages or use `python_slim` base image +1. **`docsgpt.API.ipynb`** (20 min) — Walks through every real DocsGPT Cloud + endpoint with raw HTTP calls and `docsgpt_utils` wrapper calls side by side: + `/api/answer`, `/stream`, `/api/store_attachment`, `/api/task_status`. + Also covers multi-turn conversation, SSE streaming, and the file attachment flow. -### Package Not Found Error -- Verify package name in PyPI (packages are case-sensitive) -- Check Python version compatibility -- Pin specific version if needed +2. 
**`docsgpt.example.ipynb`** (25 min) — End-to-end documentation assistant: + loads data from three real datasets (Awesome ML, Stack Overflow, The Pile), + summarises each document, generates FAQs, evaluates output with ROUGE + BLEU, + produces multi-language output in 9 languages, and launches an interactive + Gradio UI. -### Permission Issues in Container -- Check `etc_sudoers`: Ensure user has appropriate permissions -- Check file ownership: Ensure COPY doesn't create root-only files -### Jupyter Won't Connect -- Run Jupyter - ```bash - > ./docker_jupyter.sh -p 8888 - ``` +## Key files -- Verify http://localhost:8888 (not https). Check firewall if remote access - needed +| File | Purpose | +|------|---------| +| `docsgpt_utils.py` | All reusable functions and API wrappers | +| `docsgpt.API.ipynb` + `.py` | API walkthrough (Jupytext paired) | +| `docsgpt.example.ipynb` + `.py` | Full application (Jupytext paired) | +| `Dockerfile` + `docker_*.sh` | Container setup and management | +| `requirements.txt` | Pinned Python dependencies | +| `.env.example` | Template for API key configuration | -### Vim Keybindings Not Working -- If `run_jupyter.sh` exits with `ERROR: jupyterlab_vim is not installed`, it - means `jupyterlab_vim` is missing from the container image -- Make sure `jupyterlab_vim` is installed in the Dockerfile: - ```dockerfile - RUN pip install jupyterlab jupyterlab_vim - ``` -- Rebuild the image after adding the package: - ```bash - > ./docker_build.sh - ``` +See [project template README](project_template_README.md) for full +Docker usage details. 
\ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log index 8315eefe2..d1d5f7837 100644 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_build.version.log @@ -1 +1,178 @@ -the input device is not a TTY +# Python3 +Python 3.12.13 +# pip3 +pip 26.1.1 from /usr/local/lib/python3.12/site-packages/pip (python 3.12) +# jupyter +Selected Jupyter core packages... +IPython : 9.13.0 +ipykernel : 7.2.0 +ipywidgets : not installed +jupyter_client : 8.8.0 +jupyter_core : 5.9.1 +jupyter_server : 2.18.2 +jupyterlab : 4.2.5 +nbclient : 0.10.4 +nbconvert : 7.17.1 +nbformat : 5.10.4 +notebook : not installed +qtconsole : not installed +traitlets : 5.15.0 +# Python packages +Package Version +------------------------- ----------- +absl-py 2.4.0 +aiohappyeyeballs 2.6.1 +aiohttp 3.13.5 +aiosignal 1.4.0 +annotated-doc 0.0.4 +annotated-types 0.7.0 +anyio 4.13.0 +argon2-cffi 25.1.0 +argon2-cffi-bindings 25.1.0 +arrow 1.4.0 +astroid 3.3.11 +asttokens 3.0.1 +async-lru 2.3.0 +attrs 26.1.0 +babel 2.18.0 +beautifulsoup4 4.14.3 +black 24.8.0 +bleach 6.3.0 +brotli 1.2.0 +certifi 2026.4.22 +cffi 2.0.0 +charset-normalizer 3.4.7 +click 8.3.3 +comm 0.2.3 +datasets 2.20.0 +debugpy 1.8.20 +decorator 5.2.1 +deep-translator 1.11.4 +defusedxml 0.7.1 +dill 0.3.8 +executing 2.2.1 +fastapi 0.136.1 +fastjsonschema 2.21.2 +filelock 3.29.0 +fqdn 1.5.1 +frozenlist 1.8.0 +fsspec 2024.5.0 +gradio 6.14.0 +gradio_client 2.5.0 +groovy 0.1.2 +h11 0.16.0 +hf-gradio 0.4.1 +hf-xet 1.5.0 +httpcore 1.0.9 +httpx 0.28.1 +huggingface_hub 1.14.0 +idna 3.13 +ipykernel 7.2.0 +ipython 9.13.0 +ipython_pygments_lexers 1.1.1 +isoduration 20.11.0 +isort 5.13.2 +jedi 0.20.0 +Jinja2 3.1.6 
+joblib 1.5.3 +json5 0.14.0 +jsonpointer 3.1.1 +jsonschema 4.26.0 +jsonschema-specifications 2025.9.1 +jupyter_client 8.8.0 +jupyter_core 5.9.1 +jupyter-events 0.12.1 +jupyter-lsp 2.3.1 +jupyter_server 2.18.2 +jupyter_server_terminals 0.5.4 +jupyterlab 4.2.5 +jupyterlab_pygments 0.3.0 +jupyterlab_server 2.28.0 +jupyterlab-vim 4.1.4 +jupytext 1.16.4 +lark 1.3.1 +markdown-it-py 4.1.0 +MarkupSafe 3.0.3 +matplotlib-inline 0.2.1 +mccabe 0.7.0 +mdit-py-plugins 0.5.0 +mdurl 0.1.2 +mistune 3.2.1 +multidict 6.7.1 +multiprocess 0.70.16 +mypy_extensions 1.1.0 +nbclient 0.10.4 +nbconvert 7.17.1 +nbformat 5.10.4 +nest-asyncio 1.6.0 +nltk 3.9.1 +notebook_shim 0.2.4 +numpy 2.4.4 +orjson 3.11.9 +packaging 26.2 +pandas 2.2.2 +pandocfilters 1.5.1 +parso 0.8.7 +pathspec 1.1.1 +pexpect 4.9.0 +pillow 12.2.0 +pip 26.1.1 +platformdirs 4.9.6 +prometheus_client 0.25.0 +prompt_toolkit 3.0.52 +propcache 0.4.1 +psutil 7.2.2 +ptyprocess 0.7.0 +pure_eval 0.2.3 +pyarrow 24.0.0 +pyarrow-hotfix 0.7 +pycparser 3.0 +pydantic 2.13.4 +pydantic_core 2.46.4 +pydub 0.25.1 +Pygments 2.20.0 +pylint 3.3.1 +python-dateutil 2.9.0.post0 +python-dotenv 1.2.2 +python-json-logger 4.1.0 +python-multipart 0.0.27 +pytz 2026.2 +PyYAML 6.0.3 +pyzmq 27.1.0 +referencing 0.37.0 +regex 2026.4.4 +requests 2.32.3 +rfc3339-validator 0.1.4 +rfc3986-validator 0.1.1 +rfc3987-syntax 1.1.0 +rich 15.0.0 +rouge_score 0.1.2 +rpds-py 0.30.0 +safehttpx 0.1.7 +semantic-version 2.10.0 +Send2Trash 2.1.0 +setuptools 82.0.1 +shellingham 1.5.4 +six 1.17.0 +soupsieve 2.8.3 +stack-data 0.6.3 +starlette 1.0.0 +terminado 0.18.1 +tinycss2 1.4.0 +tomlkit 0.14.0 +tornado 6.5.5 +tqdm 4.67.3 +traitlets 5.15.0 +typer 0.25.1 +typing_extensions 4.15.0 +typing-inspection 0.4.2 +tzdata 2026.2 +uri-template 1.3.0 +urllib3 2.6.3 +uvicorn 0.46.0 +wcwidth 0.7.0 +webcolors 25.10.0 +webencodings 0.5.1 +websocket-client 1.9.0 +xxhash 3.7.0 +yarl 1.23.0 diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh 
b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh index 1a60dfd3a..3d1899b40 100755 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_jupyter.sh @@ -26,7 +26,8 @@ source $DOCKER_NAME print_docker_vars # List available Docker images and inspect architecture. -list_and_inspect_docker_image +run "docker image ls $FULL_IMAGE_NAME" +(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true # Run the Docker container with Jupyter Lab. CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") @@ -36,4 +37,4 @@ kill_existing_container_if_forced DOCKER_CMD=$(get_docker_jupyter_command) DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) -run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh index 32a546cf3..6d30d3a82 100644 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docker_name.sh @@ -6,7 +6,7 @@ # variables used by all docker_*.sh scripts in the project template. # """ -REPO_NAME=gpsaggese +REPO_NAME=kshitideshpande # The file should be all lower case. 
-IMAGE_NAME=umd_project_template -FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME +IMAGE_NAME=docsgpt_project +FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.ipynb b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.ipynb new file mode 100644 index 000000000..9d2791884 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.ipynb @@ -0,0 +1,1275 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# DocsGPT API Overview\n", + "\n", + "This notebook walks through every DocsGPT Cloud API endpoint one by one.\n", + "Each section shows the raw HTTP request first, then the same call using the\n", + "`docsgpt_utils` wrapper so you can see what the wrapper is doing under the hood.\n", + "\n", + "**Endpoints covered:**\n", + "\n", + "| Method | Endpoint | Purpose |\n", + "|--------|----------|---------|\n", + "| `POST` | `/api/answer` | Send a question, get a full JSON response |\n", + "| `POST` | `/stream` | Send a question, receive the answer token by token |\n", + "| `POST` | `/api/store_attachment` | Upload a file so the agent can read it |\n", + "| `GET` | `/api/task_status?task_id=...` | Check whether the file upload has finished |\n", + "\n", + "**Setup:**\n", + "```bash\n", + "./docker_build.sh\n", + "./docker_jupyter.sh\n", + "# Set your agent key before running:\n", + "cp .env.example .env\n", + "# Open .env and set DOCSGPT_API_KEY=your-agent-key\n", + "```\n", + "\n", + "> Get your key at https://app.docsgpt.cloud → Settings → Agents → Create New → copy the Key field.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "setup", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base URL : https://gptcloud.arc53.com\n", + "API key : 
5ffa3a62-f***\n", + "Ready ✓\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import json\n", + "import logging\n", + "import os\n", + "import tempfile\n", + "\n", + "import requests\n", + "import docsgpt_utils as tdgputi\n", + "\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format=\"%(asctime)s [%(levelname)s] %(message)s\",\n", + ")\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "BASE_URL = tdgputi.get_base_url()\n", + "API_KEY = tdgputi.get_api_key()\n", + "\n", + "print(f\"Base URL : {BASE_URL}\")\n", + "print(f\"API key : {API_KEY[:10]}***\")\n", + "print(\"Ready ✓\")" + ] + }, + { + "cell_type": "markdown", + "id": "ep1-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Endpoint 1: `POST /api/answer`\n", + "\n", + "`/api/answer` is the main endpoint for asking questions. You send it a JSON\n", + "body with your question and API key. DocsGPT looks up relevant content from\n", + "the agent's document index, passes it to the language model, and returns the\n", + "complete answer in one JSON response — you wait for the whole thing before\n", + "getting anything back.\n", + "\n", + "**Request body:**\n", + "```json\n", + "{\n", + " \"question\": \"Your question here\",\n", + " \"api_key\": \"your-agent-key\"\n", + "}\n", + "```\n", + "\n", + "**Response fields:**\n", + "```json\n", + "{\n", + " \"answer\": \"The full answer text\",\n", + " \"sources\": [{\"title\": \"...\", \"text\": \"...\"}],\n", + " \"conversation_id\": \"abc123\",\n", + " \"thought\": \"agent reasoning steps (if enabled)\",\n", + " \"tool_calls\": []\n", + "}\n", + "```\n", + "\n", + "The cell below makes a raw `requests.post()` call so you can see the exact\n", + "response shape. 
The cell after that makes the same call using `query_docsgpt()`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ep1-raw", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Request:\n", + " POST https://gptcloud.arc53.com/api/answer\n", + " {question: 'What is Python and what is it mainly used for?', api_key: '...'}\n", + "\n", + "Status : 200\n", + "Response keys: ['answer', 'conversation_id', 'sources', 'thought', 'tool_calls']\n", + "\n", + "answer : **Python** is a high-level, interpreted, general-purpose programming language known for its emphasis on code readability and simplicity. Created by Guido van Rossum and first released in 1991, it has become one of the most popular programming languages in the world.\n", + "\n", + "### Key Characteristics\n", + "* **Ea\n", + "sources : 0 source(s)\n", + "conversation_id: 6d5bcb29-74c9-4c35-b7c6-005055317abf\n", + "thought : \n" + ] + } + ], + "source": [ + "# ── Raw request — see exactly what /api/answer returns ─────────────────────\n", + "question = \"What is Python and what is it mainly used for?\"\n", + "\n", + "print(f\"Request:\")\n", + "print(f\" POST {BASE_URL}/api/answer\")\n", + "print(f\" {{question: '{question}', api_key: '...'}}\")\n", + "print()\n", + "\n", + "raw_resp = requests.post(\n", + " f\"{BASE_URL}/api/answer\",\n", + " json={\"question\": question, \"api_key\": API_KEY},\n", + " timeout=60,\n", + ")\n", + "\n", + "print(f\"Status : {raw_resp.status_code}\")\n", + "data = raw_resp.json()\n", + "print(f\"Response keys: {list(data.keys())}\")\n", + "print(f\"\\nanswer : {data.get('answer', '')[:300]}\")\n", + "print(f\"sources : {len(data.get('sources', []))} source(s)\")\n", + "print(f\"conversation_id: {data.get('conversation_id', '(none)')}\")\n", + "print(f\"thought : {str(data.get('thought', '(none)'))[:100]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ep1-wrapper", + "metadata": {}, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:05:28,479 [INFO] query_docsgpt | What is Python and what is it mainly used for?\n", + "2026-05-06 13:05:30,711 [INFO] answer received (2866 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[/api/answer] ANSWER:\n", + "**Python** is a high-level, interpreted, general-purpose programming language known for its emphasis on code readability and simplicity. Created by Guido van Rossum and first released in 1991, it has become one of the most popular programming languages in the world.\n", + "\n", + "### Key Characteristics\n", + "* **Easy to Read:** Python's syntax is designed to be intuitive, almost resembling English.\n", + "* **Interpreted:** Code is executed line-by-line, which makes debugging easier.\n", + "* **Versatile:** It supports multiple programming paradigms, including object-oriented, imperative, and functional programming.\n", + "* **Large Ecosystem:** It has a massive library of pre-built modules and packages (via PyPI) that allow developers to perform complex tasks without writing code from scratch.\n", + "\n", + "---\n", + "\n", + "### What is Python mainly used for?\n", + "\n", + "Python is used across almost every industry. Here are its primary applications:\n", + "\n", + "#### 1. Data Science and Data Analysis\n", + "Python is the industry standard for data science. Libraries like **Pandas**, **NumPy**, and **Matplotlib** allow for data manipulation and visualization.\n", + "* **Use case:** Analyzing sales trends or cleaning large datasets for business intelligence.\n", + "\n", + "#### 2. Artificial Intelligence (AI) and Machine Learning (ML)\n", + "Because of its stability and vast libraries like **TensorFlow**, **PyTorch**, and **Scikit-learn**, Python is the go-to language for building neural networks and predictive models.\n", + "\n", + "#### 3. 
Web Development (Back-end)\n", + "Python is used to build the \"server-side\" of web applications. Frameworks like **Django** and **Flask** make it easy to create secure and scalable websites.\n", + "* **Examples:** Instagram, Spotify, and Pinterest all use Python in their back-end.\n", + "\n", + "#### 4. Automation and Scripting\n", + "Python is perfect for automating repetitive tasks. This is often called \"scripting.\"\n", + "* **Use case:** Renaming thousands of files at once, web scraping (extracting data from websites), or sending automated emails.\n", + "\n", + "#### 5. Software Testing and Prototyping\n", + "In software development, Python is used for automated testing and for creating \"Proof of Concepts\" (PoCs) because it allows for very fast development.\n", + "\n", + "---\n", + "\n", + "### Visualization of Python Use Cases\n", + "\n", + "```mermaid\n", + "pie title Main Uses of Python\n", + " \"Data Science & ML\" : 40\n", + " \"Web Development\" : 25\n", + " \"Automation/Scripting\" : 20\n", + " \"Software Testing\" : 10\n", + " \"Others (Game Dev, Finance, etc.)\" : 5\n", + "```\n", + "\n", + "---\n", + "\n", + "### Example Code: \"Hello World\" Comparison\n", + "To illustrate Python's simplicity, here is how you print \"Hello, World!\" compared to a language like C++.\n", + "\n", + "**In Python:**\n", + "```python\n", + "print(\"Hello, World!\")\n", + "```\n", + "\n", + "**In C++:**\n", + "```cpp\n", + "#include \n", + "\n", + "int main() {\n", + " std::cout << \"Hello, World!\";\n", + " return 0;\n", + "}\n", + "```\n", + "\n", + "As you can see, Python eliminates the \"boilerplate\" code, allowing developers to focus on solving the problem rather than managing language syntax.\n" + ] + } + ], + "source": [ + "# ── Using the utility wrapper ───────────────────────────────────────────────\n", + "# query_docsgpt() is the same call, with error handling and logging built in.\n", + "result = tdgputi.query_docsgpt(question, API_KEY, BASE_URL)\n", + 
"tdgputi.print_answer(result, label=\"/api/answer\")" + ] + }, + { + "cell_type": "markdown", + "id": "ep2-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Endpoint 1b: `POST /api/answer` with `history` — Multi-Turn Conversations\n", + "\n", + "By default, each call to `/api/answer` is independent — DocsGPT does not\n", + "remember what was said before. To make it answer follow-up questions in\n", + "context, you pass a `history` field containing the previous turns.\n", + "\n", + "**How the history field works:**\n", + "- It is a list of past turns, where each turn has the question under `\"prompt\"`\n", + " and the answer under `\"response\"`\n", + "- It must be sent as a **JSON-encoded string** (not a raw Python list) —\n", + " so you wrap it with `json.dumps(...)` before putting it in the payload\n", + "\n", + "```python\n", + "# Correct format\n", + "history = json.dumps([{\"prompt\": \"What is Python?\", \"response\": \"Python is...\"}])\n", + "```\n", + "\n", + "The first cell below builds the history manually so you can see exactly what\n", + "gets sent. The second cell uses `multi_turn_conversation()`, which does this\n", + "automatically across any number of turns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ep2-manual", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Turn 1 Q: What is Python?\n", + "Turn 1 A: Python is a high-level, interpreted, and general-purpose programming language known for its emphasis on code readability. Created by **Guido van Rossum** and first released in 1991, its design philoso...\n", + "\n", + "History passed as JSON string:\n", + " [{\"prompt\": \"What is Python?\", \"response\": \"Python is a high-level, interpreted, and general-purpose programming languag...\n", + "\n", + "Turn 2 Q: What are its main use cases?\n", + "Turn 2 A: Python's versatility is its greatest strength. 
It is used across almost every industry, from finance to healthcare. Here is a detailed breakdown of its main use cases:\n", + "\n", + "### 1. Data Science and Data An...\n" + ] + } + ], + "source": [ + "# ── Manual multi-turn to show history construction explicitly ──────────────\n", + "\n", + "# Turn 1\n", + "q1 = \"What is Python?\"\n", + "r1 = requests.post(\n", + " f\"{BASE_URL}/api/answer\",\n", + " json={\"question\": q1, \"api_key\": API_KEY},\n", + " timeout=60,\n", + ").json()\n", + "a1 = r1.get(\"answer\", \"\")\n", + "\n", + "print(f\"Turn 1 Q: {q1}\")\n", + "print(f\"Turn 1 A: {a1[:200]}...\")\n", + "\n", + "# Build history in the CORRECT format\n", + "history = json.dumps([{\"prompt\": q1, \"response\": a1}]) # ← \"response\", not \"answer\"\n", + "print(f\"\\nHistory passed as JSON string:\")\n", + "print(f\" {history[:120]}...\")\n", + "\n", + "# Turn 2 — follow-up question with history attached\n", + "q2 = \"What are its main use cases?\"\n", + "r2 = requests.post(\n", + " f\"{BASE_URL}/api/answer\",\n", + " json={\"question\": q2, \"api_key\": API_KEY, \"history\": history},\n", + " timeout=60,\n", + ").json()\n", + "a2 = r2.get(\"answer\", \"\")\n", + "\n", + "print(f\"\\nTurn 2 Q: {q2}\")\n", + "print(f\"Turn 2 A: {a2[:200]}...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ep2-wrapper", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:06:41,113 [INFO] query_docsgpt | What is Python?\n", + "2026-05-06 13:06:43,055 [INFO] answer received (2215 chars)\n", + "2026-05-06 13:06:43,056 [INFO] multi-turn: turn 1 complete\n", + "2026-05-06 13:06:43,057 [INFO] query_docsgpt | What are its main use cases?\n", + "2026-05-06 13:06:46,003 [INFO] answer received (3279 chars)\n", + "2026-05-06 13:06:46,004 [INFO] multi-turn: turn 2 complete\n", + "2026-05-06 13:06:46,005 [INFO] query_docsgpt | How does it compare to Java for those use cases?\n", + "2026-05-06 13:06:54,750 
[INFO] answer received (3258 chars)\n", + "2026-05-06 13:06:54,752 [INFO] multi-turn: turn 3 complete\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- Turn 1 ---\n", + "Q: What is Python?\n", + "A: Python is a high-level, interpreted, and general-purpose programming language known for its emphasis on code readability. Created by **Guido van Rossum** and first released in 1991, its design philosophy focuses on making code easy to write and understand, often using English-like keywords.\n", + "\n", + "### Key...\n", + "\n", + "--- Turn 2 ---\n", + "Q: What are its main use cases?\n", + "A: Python's versatility is its greatest strength. It is used across almost every industry, from finance to healthcare. Here is a detailed breakdown of its main use cases:\n", + "\n", + "### 1. Data Science and Data Analysis\n", + "Python is the industry standard for data professionals. It allows for the processing of massi...\n", + "\n", + "--- Turn 3 ---\n", + "Q: How does it compare to Java for those use cases?\n", + "A: Comparing Python and Java is a classic debate. Both are extremely powerful and popular, but they serve different philosophies. While **Java is built for performance and enterprise stability**, **Python is built for developer speed and simplicity.**\n", + "\n", + "Here is how they compare across the main use cases...\n" + ] + } + ], + "source": [ + "# ── Using the wrapper — history is managed automatically ───────────────────\n", + "turns = tdgputi.multi_turn_conversation(\n", + " [\n", + " \"What is Python?\",\n", + " \"What are its main use cases?\",\n", + " \"How does it compare to Java for those use cases?\",\n", + " ],\n", + " API_KEY,\n", + " BASE_URL,\n", + ")\n", + "\n", + "for i, turn in enumerate(turns, 1):\n", + " print(f\"\\n--- Turn {i} ---\")\n", + " print(f\"Q: {turn['question']}\")\n", + " print(f\"A: {turn['answer'][:300]}{'...' 
if len(turn['answer']) > 300 else ''}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ep3-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Endpoint 2: `POST /stream` — Streaming Responses\n", + "\n", + "`/stream` works like `/api/answer` but instead of waiting for the full answer,\n", + "it sends back small pieces of text as the model generates them. This is called\n", + "**Server-Sent Events (SSE)** — the server keeps the connection open and pushes\n", + "data line by line.\n", + "\n", + "Each line that comes back starts with `data:` followed by a JSON object.\n", + "That JSON object always has a `\"type\"` field that tells you what kind of\n", + "message it is:\n", + "\n", + "| `type` | What it contains |\n", + "|--------|------------------|\n", + "| `\"answer\"` | One chunk of the answer text, in the `\"answer\"` field |\n", + "| `\"source\"` | The document chunks the agent used to answer |\n", + "| `\"thought\"` | The agent's reasoning steps (if your agent has this enabled) |\n", + "| `\"id\"` | The conversation ID, sent at the very end |\n", + "| `\"end\"` | Signals that the stream is finished |\n", + "| `\"error\"` | An error message |\n", + "\n", + "The first cell below reads the raw SSE stream line by line so you can see\n", + "every event. The second uses `stream_docsgpt()` which handles the parsing\n", + "and returns the assembled answer string. The third uses\n", + "`stream_docsgpt_events()` which yields every event dict if you need to\n", + "access sources or the conversation ID.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ep3-raw", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Request:\n", + " POST https://gptcloud.arc53.com/stream\n", + " Accept: text/event-stream\n", + " {question: 'What programming paradigms does Python support? 
Give a one-l...'}\n", + "\n", + "⚡ Live SSE events:\n", + "-------------------------------------------------------\n", + "Python is a **multi-paradigm** programming language, allowing developers to choose the style that best fits their problem. Here are the primary paradigms it supports with one-line examples:\n", + "\n", + "### 1. Procedural Programming\n", + "Focuses on a sequence of statements and functions to complete a task.\n", + "```python\n", + "print(\"The sum is:\", sum([10, 20, 30]))\n", + "```\n", + "\n", + "### 2. Object-Oriented Programming (OOP)\n", + "Organizes code into objects that combine data (attributes) and behavior (methods).\n", + "```python\n", + "class Dog: pass; my_dog = Dog()\n", + "```\n", + "\n", + "### 3. Functional Programming\n", + "Treats computation as the evaluation of mathematical functions and avoids changing-state and mutable data.\n", + "```python\n", + "squared_numbers = list(map(lambda x: x**2, [1, 2, 3, 4]))\n", + "```\n", + "\n", + "### 4. Imperative Programming\n", + "Uses statements that change a program's state (directly instructing the computer *how* to do something).\n", + "```python\n", + "x = 5; x = x + 10; print(x)\n", + "```\n", + "\n", + "---\n", + "\n", + "### Visualization of Python's Paradigms\n", + "\n", + "```mermaid\n", + "graph TD\n", + " A[Python Multi-Paradigm] --> B[Procedural]\n", + " A --> C[Object-Oriented]\n", + " A --> D[Functional]\n", + " A --> E[Imperative]\n", + "\n", + " B --> B1[Focus: Functions & Sequences]\n", + " C --> C1[Focus: Classes & Objects]\n", + " D --> D1[Focus: Pure Functions & Lambda]\n", + " E --> E1[Focus: State Changes]\n", + "```\n", + "\n", + "Python also supports features of **Reflective** (metaprogramming) and **Aspect-Oriented** programming (via decorators), though the four listed above are the most common styles used by developers.\n", + "[ID event — conversation_id: 6ebb0e75-7a93-4566-b657-d93aa9a1af42]\n", + "\n", + "[END]\n", + 
"-------------------------------------------------------\n", + "\n", + "Event type breakdown: {'message_id': 1, 'answer': 16, 'tool_calls': 1, 'id': 1, 'end': 1}\n" + ] + } + ], + "source": [ + "# ── Raw SSE streaming — annotated to show every line ──────────────────────\n", + "stream_q = \"What programming paradigms does Python support? Give a one-line example of each.\"\n", + "\n", + "print(f\"Request:\")\n", + "print(f\" POST {BASE_URL}/stream\")\n", + "print(f\" Accept: text/event-stream\")\n", + "print(f\" {{question: '{stream_q[:60]}...'}}\")\n", + "print(\"\\n⚡ Live SSE events:\")\n", + "print(\"-\" * 55)\n", + "\n", + "event_log: list = []\n", + "\n", + "with requests.post(\n", + " f\"{BASE_URL}/stream\",\n", + " json={\"question\": stream_q, \"api_key\": API_KEY},\n", + " stream=True,\n", + " timeout=120,\n", + " headers={\"Accept\": \"text/event-stream\"},\n", + ") as resp:\n", + " resp.raise_for_status()\n", + " for raw in resp.iter_lines():\n", + " if not raw:\n", + " continue\n", + " line = raw.decode(\"utf-8\") if isinstance(raw, bytes) else raw\n", + " if not line.startswith(\"data:\"):\n", + " continue\n", + " try:\n", + " event = json.loads(line[5:].strip())\n", + " event_log.append(event)\n", + " etype = event.get(\"type\", \"?\")\n", + " if etype == \"answer\":\n", + " print(event[\"answer\"], end=\"\", flush=True) # ← field is \"answer\"\n", + " elif etype == \"end\":\n", + " print(\"\\n[END]\")\n", + " break\n", + " elif etype == \"source\":\n", + " print(f\"\\n[SOURCE event — {len(event.get('sources',[]))} source(s)]\")\n", + " elif etype == \"id\":\n", + " print(f\"\\n[ID event — conversation_id: {event.get('id', '')}]\")\n", + " elif etype == \"error\":\n", + " print(f\"\\n[ERROR: {event.get('error', '')}]\")\n", + " except json.JSONDecodeError:\n", + " pass\n", + "\n", + "print(\"-\" * 55)\n", + "type_counts = {}\n", + "for e in event_log:\n", + " t = e.get(\"type\", \"?\")\n", + " type_counts[t] = type_counts.get(t, 0) + 1\n", + 
"print(f\"\\nEvent type breakdown: {type_counts}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ep3-wrapper", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:09:12,763 [INFO] stream_docsgpt | What is the GIL in Python and why does it matter?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using stream_docsgpt() wrapper:\n", + "-------------------------------------------------------\n", + "The **Global Interpreter Lock (GIL)** is a mutex (or a lock) that allows only one thread to hold control of the Python interpreter at a time. This means that even on a multi-core processor, only one thread can execute Python bytecode at any given moment within a single process.\n", + "\n", + "### Why does the GIL exist?\n", + "The primary reason for the GIL is **memory management**. CPython (the standard Python implementation) uses reference counting to manage memory. Without the GIL, two threads could simultaneously increase or decrease the reference count of an object, leading to:\n", + "1. **Memory leaks:** Where memory is never released.\n", + "2. 
**Crashes:** Where an object is deleted while another thread still needs it.\n", + "\n", + "By using a single lock, CPython ensures thread safety for these internal operations simply and efficiently.\n", + "\n", + "---\n", + "\n", + "### How the GIL Works (Visualization)\n", + "Even if you have multiple threads, they must \"take turns\" holding the lock to execute code.\n", + "\n", + "```mermaid\n", + "sequenceDiagram\n", + " participant CPU1 as Core 1\n", + " participant CPU2 as Core 2\n", + " participant GIL as Global Interpreter Lock\n", + "\n", + " Note over CPU1, CPU2: Multi-threaded Python Process\n", + " CPU1->>GIL: Acquire Lock\n", + " Note right of CPU1: Thread 1 Executing\n", + " CPU2->>GIL: Wait...\n", + " CPU1->>GIL: Release Lock (I/O or Tick)\n", + " CPU2->>GIL: Acquire Lock\n", + " Note right of CPU2: Thread 2 Executing\n", + " CPU1->>GIL: Wait...\n", + "```\n", + "\n", + "---\n", + "\n", + "### Why the GIL Matters\n", + "\n", + "#### 1. Impact on CPU-Bound Tasks\n", + "For tasks that require heavy computation (like mathematical calculations or image processing), the GIL becomes a bottleneck. Since only one thread runs at a time, adding more threads does not make the program faster; in fact, it can make it slower due to the overhead of switching between threads.\n", + "\n", + "#### 2. Impact on I/O-Bound Tasks\n", + "For tasks that spend a lot of time waiting (like network requests or reading from a disk), the GIL is **not a big problem**. When a thread waits for I/O, it releases the GIL, allowing another thread to run in the meantime.\n", + "\n", + "#### 3. Simplicity for C Extensions\n", + "Many C libraries (like NumPy) are wrapped in Python. 
The GIL makes it easier to write these extensions because the developer doesn't have to worry about complex thread-safety for Python objects within their C code.\n", + "\n", + "---\n", + "\n", + "### How to Bypass the GIL\n", + "If you need true parallelism in Python, you generally have three options:\n", + "\n", + "1. **Multiprocessing:** Use the `multiprocessing` module. This creates separate instances of the Python interpreter (separate processes), each with its own GIL.\n", + "2. **C Extensions:** Libraries like NumPy or SciPy perform heavy calculations in C, releasing the GIL while they work.\n", + "3. **Alternative Interpreters:** Implementations like **Jython** or **IronPython** do not have a GIL.\n", + "\n", + "### Code Example: Threading vs. Multiprocessing\n", + "\n", + "```python\n", + "import time\n", + "from threading import Thread\n", + "from multiprocessing import Process\n", + "\n", + "def cpu_bound_task(n):\n", + " while n > 0:\n", + " n -= 1\n", + "\n", + "count = 50_000_000\n", + "\n", + "# Using Threads (Limited by GIL)\n", + "t1 = Thread(target=cpu_bound_task, args=(count,))\n", + "t2 = Thread(target=cpu_bound_task, args=(count,))\n", + "\n", + "start = time.time()\n", + "t1.start(); t2.start()\n", + "t1.join(); t2.join()\n", + "print(f\"Threading time: {time.time() - start:.2f}s\")\n", + "\n", + "# Using Processes (Bypasses GIL)\n", + "p1 = Process(target=cpu_bound_task, args=(count,))\n", + "p2 = Process(target=cpu_bound_task, args=(count,))\n", + "\n", + "start = time.time()\n", + "p1.start(); p2.start()\n", + "p1.join(); p2.join()\n", + "print(f\"Multiprocessing time: {time.time() - start:.2f}s\")\n", + "```\n", + "\n", + "### The Future: Python 3.13 and Beyond\n", + "With the acceptance of **PEP 703**, Python is moving toward making the GIL optional (Experimental \"Free-threading\" builds). 
This will eventually allow Python to take full advantage of multi-core processors without the current locking limitations, though it will take several years for the ecosystem to fully adapt." + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:09:25,935 [INFO] stream complete (3830 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "-------------------------------------------------------\n", + "Total chars assembled: 3830\n" + ] + } + ], + "source": [ + "# ── stream_docsgpt() wrapper ───────────────────────────────────────────────\n", + "# Handles all the SSE parsing and returns the full assembled answer string.\n", + "print(\"Using stream_docsgpt() wrapper:\")\n", + "print(\"-\" * 55)\n", + "\n", + "answer = tdgputi.stream_docsgpt(\n", + " \"What is the GIL in Python and why does it matter?\",\n", + " API_KEY,\n", + " BASE_URL,\n", + " print_live=True,\n", + ")\n", + "print(\"-\" * 55)\n", + "print(f\"Total chars assembled: {len(answer)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ep3-events", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using stream_docsgpt_events() to capture all event types:\n", + "\n", + "→ Conversation ID: bfdd4883-530c-495d-acc1-3c5d92557ad6\n", + "→ Stream ended\n", + "\n", + "Full answer: Python is well-known for its robust ecosystem of web frameworks. Here are three of the most popular ones:\n", + "\n", + "### 1. Django\n", + "Django is a high-level, \"batteries-included\" framework that encourages rapid development and clean, pragmatic design. 
It handles much of the complexity of web development out of t\n" + ] + } + ], + "source": [ + "# ── stream_docsgpt_events() — get every event type, not just tokens ────────\n", + "# Useful when you need the sources the RAG retrieved, or the conversation ID.\n", + "print(\"Using stream_docsgpt_events() to capture all event types:\\n\")\n", + "\n", + "answer_parts = []\n", + "sources = []\n", + "conv_id = \"\"\n", + "\n", + "for event in tdgputi.stream_docsgpt_events(\n", + " \"Name three popular Python web frameworks.\",\n", + " API_KEY, BASE_URL,\n", + "):\n", + " etype = event.get(\"type\")\n", + " if etype == \"answer\":\n", + " answer_parts.append(event.get(\"answer\", \"\"))\n", + " elif etype == \"source\":\n", + " sources = event.get(\"sources\", [])\n", + " print(f\"→ Received {len(sources)} source(s) from RAG retrieval\")\n", + " elif etype == \"id\":\n", + " conv_id = event.get(\"id\", \"\")\n", + " print(f\"→ Conversation ID: {conv_id}\")\n", + " elif etype == \"end\":\n", + " print(f\"→ Stream ended\")\n", + "\n", + "print(f\"\\nFull answer: {''.join(answer_parts)[:300]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ep4-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Endpoints 3 & 4: File Attachment Flow\n", + "\n", + "DocsGPT processes uploaded files in the background, so attaching a file\n", + "to a query takes three separate steps:\n", + "\n", + "**Step 1 — Upload the file** (`POST /api/store_attachment`)\n", + "Send the file as multipart form data. The server queues it for processing\n", + "and immediately returns a `task_id`.\n", + "\n", + "**Step 2 — Wait for processing** (`GET /api/task_status?task_id=...`)\n", + "Poll this endpoint every few seconds. When `status` becomes `\"SUCCESS\"`,\n", + "the response includes an `attachment_id`.\n", + "\n", + "**Step 3 — Use the attachment** (`POST /stream`)\n", + "Pass the `attachment_id` in the `\"attachments\"` list. 
The model can now\n", + "read the file when generating its answer.\n", + "\n", + "```\n", + "POST /api/store_attachment → { task_id: \"abc\" }\n", + " ↓\n", + "GET /api/task_status → { status: \"SUCCESS\", attachment_id: \"xyz\" }\n", + " ↓\n", + "POST /stream (attachments: [\"xyz\"]) → answer stream\n", + "```\n", + "\n", + "The three cells below go through each step. `upload_and_attach()` combines\n", + "steps 1 and 2 into a single function call.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ep4-upload", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📁 Temp file : /var/folders/lj/89tkjhqd6034t1cfsz5drsy80000gn/T/api_test_a080pyn5.txt\n", + " Content : 494 chars\n", + "\n", + "Request:\n", + " POST https://gptcloud.arc53.com/api/store_attachment\n", + " multipart/form-data: file=, api_key=\n", + "\n", + "Status : 200\n", + "Response : {'message': 'File uploaded successfully. Processing started.', 'success': True, 'task_id': '3f8c0dad-1202-4582-b1f3-d4c904600b11'}\n", + "task_id : 3f8c0dad-1202-4582-b1f3-d4c904600b11\n" + ] + } + ], + "source": [ + "# ── Step 1: POST /api/store_attachment ────────────────────────────────────\n", + "attach_content = (\n", + " \"Transformers: A Neural Network Architecture\\n\\n\"\n", + " \"Introduced in 'Attention is All You Need' (Vaswani et al. 2017).\\n\"\n", + " \"Key innovation: self-attention allows parallel processing of sequences.\\n\\n\"\n", + " \"Core components:\\n\"\n", + " \" 1. Multi-head self-attention — attends to all positions simultaneously\\n\"\n", + " \" 2. Positional encoding — injects sequence order without recurrence\\n\"\n", + " \" 3. 
Feed-forward layers — applies non-linear transformations\\n\\n\"\n", + " \"Famous models: BERT (encoder-only), GPT (decoder-only), T5 (encoder-decoder).\"\n", + ")\n", + "\n", + "with tempfile.NamedTemporaryFile(\n", + " mode=\"w\", suffix=\".txt\", prefix=\"api_test_\",\n", + " delete=False, encoding=\"utf-8\",\n", + ") as tmp:\n", + " tmp.write(attach_content)\n", + " tmp_path = tmp.name\n", + "\n", + "print(f\"📁 Temp file : {tmp_path}\")\n", + "print(f\" Content : {len(attach_content)} chars\")\n", + "print()\n", + "print(f\"Request:\")\n", + "print(f\" POST {BASE_URL}/api/store_attachment\")\n", + "print(f\" multipart/form-data: file=, api_key=\")\n", + "print()\n", + "\n", + "with open(tmp_path, \"rb\") as fh:\n", + " upload_resp = requests.post(\n", + " f\"{BASE_URL}/api/store_attachment\",\n", + " files={\"file\": (os.path.basename(tmp_path), fh)},\n", + " data={\"api_key\": API_KEY},\n", + " timeout=60,\n", + " )\n", + "\n", + "print(f\"Status : {upload_resp.status_code}\")\n", + "upload_data = upload_resp.json()\n", + "print(f\"Response : {upload_data}\")\n", + "task_id = upload_data.get(\"task_id\", \"\")\n", + "print(f\"task_id : {task_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ep5-poll", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Request:\n", + " GET https://gptcloud.arc53.com/api/task_status?task_id=3f8c0dad-1202-4582-b1f3-d4c904600b11\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:10:00,249 [INFO] polling task_status for task_id=3f8c0dad-1202-4582-b1f3-d4c904600b11\n", + "2026-05-06 13:10:00,444 [INFO] attachment ready: df880cfb-389c-4054-ae6a-0f6c1df39825\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Single poll result: {'result': {'attachment_id': 'df880cfb-389c-4054-ae6a-0f6c1df39825', 'filename': 'api_test_a080pyn5.txt', 'metadata': {'bucket_name': 'docsgpt-user-buffer', 
'region': 'eu-west-2', 'storage_type': 's3', 'uri': 's3://docsgpt-user-buffer/inputs/user_3DHWYxAbVPcuNzGPycPJ3RZoWGT/attachments/df880cfb-389c-4054-ae6a-0f6c1df39825/api_test_a080pyn5.txt'}, 'mime_type': 'text/plain', 'path': 'inputs/user_3DHWYxAbVPcuNzGPycPJ3RZoWGT/attachments/df880cfb-389c-4054-ae6a-0f6c1df39825/api_test_a080pyn5.txt', 'token_count': 113}, 'status': 'SUCCESS'}\n", + "\n", + "Polling until SUCCESS...\n", + "✅ attachment_id: df880cfb-389c-4054-ae6a-0f6c1df39825\n" + ] + } + ], + "source": [ + "# ── Step 2: GET /api/task_status — poll until SUCCESS ─────────────────────\n", + "print(f\"Request:\")\n", + "print(f\" GET {BASE_URL}/api/task_status?task_id={task_id}\")\n", + "print()\n", + "\n", + "attachment_id = \"\"\n", + "\n", + "if task_id:\n", + " # Show one raw poll to see the response shape\n", + " single_poll = requests.get(\n", + " f\"{BASE_URL}/api/task_status\",\n", + " params={\"task_id\": task_id},\n", + " timeout=15,\n", + " ).json()\n", + " print(f\"Single poll result: {single_poll}\")\n", + " print()\n", + "\n", + " # Now use the wrapper which polls automatically until SUCCESS\n", + " print(\"Polling until SUCCESS...\")\n", + " try:\n", + " attachment_id = tdgputi.poll_attachment_status(\n", + " task_id, BASE_URL, timeout_sec=90, poll_interval=3.0\n", + " )\n", + " print(f\"✅ attachment_id: {attachment_id}\")\n", + " except (TimeoutError, RuntimeError) as e:\n", + " print(f\"⚠️ {e}\")\n", + "else:\n", + " print(\"No task_id available — skipping poll step.\")\n", + "\n", + "os.unlink(tmp_path) # clean up temp file" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ep5-use", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:10:00,468 [INFO] stream_docsgpt | What are the three core components described in this document? 
Explain each one.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Request:\n", + " POST https://gptcloud.arc53.com/stream\n", + " {question: '...', api_key: '...', attachments: ['df880cfb-389...']}\n", + "\n", + "⚡ Streaming response with attachment:\n", + "-------------------------------------------------------\n", + "Based on the document provided, the three core components of the Transformer architecture are:\n", + "\n", + "1. **Multi-head self-attention**: This component allows the model to attend to all positions in a sequence simultaneously. It enables the network to weigh the importance of different words in a sentence, regardless of their distance from one another.\n", + "2. **Positional encoding**: Since Transformers do not use recurrence (like RNNs), they don't inherently know the order of words. Positional encoding injects information about the sequence order into the model without needing sequential processing.\n", + "3. **Feed-forward layers**: These layers apply non-linear transformations to the data after it has passed through the attention mechanisms, helping the model learn complex patterns.\n", + "\n", + "### Visualization of Transformer Components\n", + "To help you visualize how these components interact within the architecture, here is a diagram:\n", + "\n", + "```mermaid\n", + "graph TD\n", + " A[Input Embeddings] --> B[Positional Encoding]\n", + " B --> C[Multi-Head Self-Attention]\n", + " C --> D[Add & Norm]\n", + " D --> E[Feed-Forward Layers]\n", + " E --> F[Add & Norm]\n", + " F --> G[Linear/Softmax Output]\n", + "\n", + " subgraph \"Core Components\"\n", + " B\n", + " C\n", + " E\n", + " end\n", + "```\n", + "\n", + "These components work together to allow the parallel processing of sequences, which was the key innovation introduced in the \"Attention is All You Need\" paper." 
+ ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:10:28,398 [INFO] stream complete (1370 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "-------------------------------------------------------\n", + "✅ Answer: 1370 chars\n" + ] + } + ], + "source": [ + "# ── Step 3: POST /stream with attachments=[attachment_id] ─────────────────\n", + "if attachment_id:\n", + " print(f\"Request:\")\n", + " print(f\" POST {BASE_URL}/stream\")\n", + " print(f\" {{question: '...', api_key: '...', attachments: ['{attachment_id[:12]}...']}}\")\n", + " print()\n", + " print(\"⚡ Streaming response with attachment:\\n\" + \"-\"*55)\n", + "\n", + " answer = tdgputi.stream_docsgpt(\n", + " \"What are the three core components described in this document? Explain each one.\",\n", + " API_KEY,\n", + " BASE_URL,\n", + " attachments=[attachment_id],\n", + " print_live=True,\n", + " )\n", + " print(\"-\"*55)\n", + " print(f\"✅ Answer: {len(answer)} chars\")\n", + "else:\n", + " print(\"No attachment_id — skipping. (Try again if the upload failed above.)\")" + ] + }, + { + "cell_type": "markdown", + "id": "inline-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Summarisation & FAQ — Sending Text in the Prompt\n", + "\n", + "Summarisation and FAQ generation do not use separate endpoints.\n", + "Both work by embedding the document text directly inside the question\n", + "and sending it to `/api/answer`.\n", + "\n", + "`summarize_document()` builds a prompt like this and sends it:\n", + "```\n", + "Read the following document and write a summary in under {max_words} words.\n", + "\n", + "DOCUMENT:\n", + "{document_text}\n", + "```\n", + "\n", + "`generate_faqs()` builds a prompt that asks for Q:/A: formatted pairs:\n", + "```\n", + "Generate exactly {n} FAQs with answers. 
Use this format:\n", + "Q: {question}\n", + "A: {answer}\n", + "\n", + "DOCUMENT:\n", + "{document_text}\n", + "```\n", + "\n", + "After DocsGPT responds, `parse_faqs()` splits the raw text on `Q:` markers\n", + "and extracts each question-answer pair into a list of dicts.\n", + "\n", + "The text is auto-truncated to 4000 characters before sending to stay within\n", + "the prompt size limit.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "inline-demo", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:16:48,012 [INFO] summarize_document: 'api_walkthrough' (522 chars)\n", + "2026-05-06 13:16:48,013 [INFO] query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 8\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📝 SUMMARISATION via POST /api/answer (inline prompt)\n", + "=======================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:16:49,536 [INFO] answer received (418 chars)\n", + "2026-05-06 13:16:49,537 [INFO] generate_faqs: 3 Qs for 'api_walkthrough'\n", + "2026-05-06 13:16:49,538 [INFO] query_docsgpt | Based on the document below, generate exactly 3 frequently asked questions (FAQs) with detailed, hel\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary (61 words):\n", + "Created by Guido van Rossum in 1991, Python is a high-level, interpreted programming language prized for its readability and support for procedural, object-oriented, and functional paradigms. It is widely used in data science, machine learning, and web development. 
Python’s strength lies in its extensive standard library and a vast open-source ecosystem, supported by the pip package manager and the PyPI repository.\n", + "\n", + "❓ FAQ GENERATION via POST /api/answer (inline prompt)\n", + "=======================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:16:52,404 [INFO] answer received (1208 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 3 FAQs:\n", + "\n", + "Q1: What are the primary characteristics and programming paradigms supported by Python?\n", + "A1: Python is a high-level, interpreted programming language distinguished by its clear syntax and focus on readability. It is highly flexible, supporting multiple programming paradigms including procedural, object-oriented, and functional programming, which allows developers to choose the best approach for their specific project needs.\n", + "\n", + "Q2: What are the most common real-world applications for the Python language?\n", + "A2: Python is a versatile tool widely utilized across several major technical domains. It is a leading language in the fields of data science and machine learning, a popular choice for building backend systems in web development, and a frequently used language for streamlining tasks through automation.\n", + "\n", + "Q3: Who created Python and how does its ecosystem support modern development?\n", + "A3: Python was created by Guido van Rossum and first released in 1991. Today, it supports developers through a massive standard library and a vibrant open-source ecosystem. 
Its dedicated package manager, pip, provides seamless access to hundreds of thousands of third-party packages hosted on the Python Package Index (PyPI).\n" + ] + } + ], + "source": [ + "# ── Inline summarisation and FAQ demo ─────────────────────────────────────\n", + "sample_doc = (\n", + " \"Python is a high-level, interpreted programming language known for its clear \"\n", + " \"syntax and readability. It supports multiple programming paradigms including \"\n", + " \"procedural, object-oriented, and functional programming. Python is widely used \"\n", + " \"in data science, machine learning, web development, and automation. It was \"\n", + " \"created by Guido van Rossum and first released in 1991. Python has a large \"\n", + " \"standard library and a vibrant open-source ecosystem. Its package manager pip \"\n", + " \"provides access to hundreds of thousands of packages on PyPI.\"\n", + ")\n", + "\n", + "# ── Summarisation ──────────────────────────────────────────────────────────\n", + "print(\"📝 SUMMARISATION via POST /api/answer (inline prompt)\")\n", + "print(\"=\" * 55)\n", + "\n", + "summary = tdgputi.summarize_document(\n", + " sample_doc, API_KEY, BASE_URL, max_words=80, source_label=\"api_walkthrough\"\n", + ")\n", + "print(f\"Summary ({len(summary.split())} words):\\n{summary}\")\n", + "\n", + "# ── FAQ Generation ─────────────────────────────────────────────────────────\n", + "print(\"\\n❓ FAQ GENERATION via POST /api/answer (inline prompt)\")\n", + "print(\"=\" * 55)\n", + "\n", + "faqs = tdgputi.generate_faqs(\n", + " sample_doc, API_KEY, BASE_URL, n_questions=3, source_label=\"api_walkthrough\"\n", + ")\n", + "print(f\"Generated {len(faqs)} FAQs:\")\n", + "tdgputi.print_faqs(faqs)" + ] + }, + { + "cell_type": "markdown", + "id": "3edf40e4", + "metadata": {}, + "source": [ + "## Evaluation — ROUGE and BLEU\n", + "\n", + "We use two standard metrics to measure how well the generated summaries and\n", + "FAQs capture the content of the original 
document.\n", + "\n", + "**ROUGE** (Recall-Oriented Understudy for Gisting Evaluation) counts how many\n", + "words or phrases from the reference text appear in the generated output.\n", + "We compute three variants:\n", + "\n", + "- **ROUGE-1** — counts single word matches. If the reference says\n", + " \"machine learning\" and the summary says \"machine learning\", both words count.\n", + "- **ROUGE-2** — counts two-word phrase matches. \"machine learning\" as a pair\n", + " has to appear in both texts to score.\n", + "- **ROUGE-L** — finds the longest sequence of words that appear in the same\n", + " order in both texts, even if not consecutive.\n", + "\n", + "All three return an F1 score between 0 and 1. Scores between 0.2 and 0.4 are\n", + "typical for AI-generated summaries — the model paraphrases rather than copying,\n", + "so exact word matches will naturally be lower.\n", + "\n", + "**BLEU** (Bilingual Evaluation Understudy) works from the other direction — it\n", + "checks how many words in the generated text also appear in the reference, and\n", + "applies a penalty if the output is too short. Also 0 to 1, higher is better.\n", + "\n", + "`evaluate_output(hypothesis, reference)` runs both metrics and returns a single\n", + "dict with keys `rouge1`, `rouge2`, `rougeL`, and `bleu`. `evaluate_all()` runs\n", + "it across every dataset, using the first 500 characters of each source text as\n", + "the reference. Because that reference is the source text itself, the scores measure how much of the source's wording the output reuses (and can exceed the typical range above); they are not a comparison against a human-written summary."
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e5099415", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:16:53,230 [INFO] Using default tokenizer.\n", + "2026-05-06 13:16:53,232 [INFO] evaluate_output: {'rouge1': 0.7338, 'rouge2': 0.4672, 'rougeL': 0.5324, 'bleu': 0.3158}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📊 EVALUATION (ROUGE + BLEU)\n", + "=======================================================\n", + " [summary] ROUGE1: 0.7338\n", + " [summary] ROUGE2: 0.4672\n", + " [summary] ROUGEL: 0.5324\n", + " [summary] BLEU: 0.3158\n" + ] + } + ], + "source": [ + "# ── Evaluation ─────────────────────────────────────────────────────────────\n", + "print(\"\\n📊 EVALUATION (ROUGE + BLEU)\")\n", + "print(\"=\" * 55)\n", + "\n", + "scores = tdgputi.evaluate_output(summary, sample_doc[:500])\n", + "tdgputi.print_scores(scores, label=\"summary\")" + ] + }, + { + "cell_type": "markdown", + "id": "summary-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Summary\n", + "\n", + "| Endpoint | Method | Wrapper function |\n", + "|----------|--------|------------------|\n", + "| `/api/answer` | POST | `query_docsgpt()` |\n", + "| `/api/answer` + history | POST | `multi_turn_conversation()` |\n", + "| `/stream` | POST | `stream_docsgpt()`, `stream_docsgpt_events()` |\n", + "| `/api/store_attachment` | POST | `store_attachment()` |\n", + "| `/api/task_status` | GET | `poll_attachment_status()` |\n", + "\n", + "**Things to remember:**\n", + "- History is a JSON-encoded string, not a raw list\n", + "- The history reply key is `\"response\"`, not `\"answer\"`\n", + "- SSE answer chunks have `type == \"answer\"` and the text is in `event[\"answer\"]`\n", + "- File uploads are async — always poll until SUCCESS before using the attachment\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.13.7)", + "language": "python", 
+ "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.py new file mode 100644 index 000000000..3b22f4fb0 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.API.py @@ -0,0 +1,580 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.4 +# kernelspec: +# display_name: .venv (3.13.7) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # DocsGPT API Overview +# +# This notebook walks through every DocsGPT Cloud API endpoint one by one. +# Each section shows the raw HTTP request first, then the same call using the +# `docsgpt_utils` wrapper so you can see what the wrapper is doing under the hood. +# +# **Endpoints covered:** +# +# | Method | Endpoint | Purpose | +# |--------|----------|---------| +# | `POST` | `/api/answer` | Send a question, get a full JSON response | +# | `POST` | `/stream` | Send a question, receive the answer token by token | +# | `POST` | `/api/store_attachment` | Upload a file so the agent can read it | +# | `GET` | `/api/task_status?task_id=...` | Check whether the file upload has finished | +# +# **Setup:** +# ```bash +# ./docker_build.sh +# ./docker_jupyter.sh +# # Set your agent key before running: +# cp .env.example .env +# # Open .env and set DOCSGPT_API_KEY=your-agent-key +# ``` +# +# > Get your key at https://app.docsgpt.cloud → Settings → Agents → Create New → copy the Key field. 
+# + +# %% +# %load_ext autoreload +# %autoreload 2 + +import json +import logging +import os +import tempfile + +import requests +import docsgpt_utils as tdgputi + +from dotenv import load_dotenv +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) +_LOG = logging.getLogger(__name__) + +BASE_URL = tdgputi.get_base_url() +API_KEY = tdgputi.get_api_key() + +print(f"Base URL : {BASE_URL}") +print(f"API key : {API_KEY[:10]}***") +print("Ready ✓") + +# %% [markdown] +# --- +# +# ## Endpoint 1: `POST /api/answer` +# +# `/api/answer` is the main endpoint for asking questions. You send it a JSON +# body with your question and API key. DocsGPT looks up relevant content from +# the agent's document index, passes it to the language model, and returns the +# complete answer in one JSON response — you wait for the whole thing before +# getting anything back. +# +# **Request body:** +# ```json +# { +# "question": "Your question here", +# "api_key": "your-agent-key" +# } +# ``` +# +# **Response fields:** +# ```json +# { +# "answer": "The full answer text", +# "sources": [{"title": "...", "text": "..."}], +# "conversation_id": "abc123", +# "thought": "agent reasoning steps (if enabled)", +# "tool_calls": [] +# } +# ``` +# +# The cell below makes a raw `requests.post()` call so you can see the exact +# response shape. The cell after that makes the same call using `query_docsgpt()`. +# + +# %% +# ── Raw request — see exactly what /api/answer returns ───────────────────── +question = "What is Python and what is it mainly used for?" 
+ +print(f"Request:") +print(f" POST {BASE_URL}/api/answer") +print(f" {{question: '{question}', api_key: '...'}}") +print() + +raw_resp = requests.post( + f"{BASE_URL}/api/answer", + json={"question": question, "api_key": API_KEY}, + timeout=60, +) + +print(f"Status : {raw_resp.status_code}") +data = raw_resp.json() +print(f"Response keys: {list(data.keys())}") +print(f"\nanswer : {data.get('answer', '')[:300]}") +print(f"sources : {len(data.get('sources', []))} source(s)") +print(f"conversation_id: {data.get('conversation_id', '(none)')}") +print(f"thought : {str(data.get('thought', '(none)'))[:100]}") + +# %% +# ── Using the utility wrapper ─────────────────────────────────────────────── +# query_docsgpt() is the same call, with error handling and logging built in. +result = tdgputi.query_docsgpt(question, API_KEY, BASE_URL) +tdgputi.print_answer(result, label="/api/answer") + +# %% [markdown] +# --- +# +# ## Endpoint 1b: `POST /api/answer` with `history` — Multi-Turn Conversations +# +# By default, each call to `/api/answer` is independent — DocsGPT does not +# remember what was said before. To make it answer follow-up questions in +# context, you pass a `history` field containing the previous turns. +# +# **How the history field works:** +# - It is a list of past turns, where each turn has the question under `"prompt"` +# and the answer under `"response"` +# - It must be sent as a **JSON-encoded string** (not a raw Python list) — +# so you wrap it with `json.dumps(...)` before putting it in the payload +# +# ```python +# # Correct format +# history = json.dumps([{"prompt": "What is Python?", "response": "Python is..."}]) +# ``` +# +# The first cell below builds the history manually so you can see exactly what +# gets sent. The second cell uses `multi_turn_conversation()`, which does this +# automatically across any number of turns. +# + +# %% +# ── Manual multi-turn to show history construction explicitly ────────────── + +# Turn 1 +q1 = "What is Python?" 
+r1 = requests.post( + f"{BASE_URL}/api/answer", + json={"question": q1, "api_key": API_KEY}, + timeout=60, +).json() +a1 = r1.get("answer", "") + +print(f"Turn 1 Q: {q1}") +print(f"Turn 1 A: {a1[:200]}...") + +# Build history in the CORRECT format +history = json.dumps([{"prompt": q1, "response": a1}]) # ← "response", not "answer" +print(f"\nHistory passed as JSON string:") +print(f" {history[:120]}...") + +# Turn 2 — follow-up question with history attached +q2 = "What are its main use cases?" +r2 = requests.post( + f"{BASE_URL}/api/answer", + json={"question": q2, "api_key": API_KEY, "history": history}, + timeout=60, +).json() +a2 = r2.get("answer", "") + +print(f"\nTurn 2 Q: {q2}") +print(f"Turn 2 A: {a2[:200]}...") + +# %% +# ── Using the wrapper — history is managed automatically ─────────────────── +turns = tdgputi.multi_turn_conversation( + [ + "What is Python?", + "What are its main use cases?", + "How does it compare to Java for those use cases?", + ], + API_KEY, + BASE_URL, +) + +for i, turn in enumerate(turns, 1): + print(f"\n--- Turn {i} ---") + print(f"Q: {turn['question']}") + print(f"A: {turn['answer'][:300]}{'...' if len(turn['answer']) > 300 else ''}") + +# %% [markdown] +# --- +# +# ## Endpoint 2: `POST /stream` — Streaming Responses +# +# `/stream` works like `/api/answer` but instead of waiting for the full answer, +# it sends back small pieces of text as the model generates them. This is called +# **Server-Sent Events (SSE)** — the server keeps the connection open and pushes +# data line by line. +# +# Each line that comes back starts with `data:` followed by a JSON object. 
+# That JSON object always has a `"type"` field that tells you what kind of +# message it is: +# +# | `type` | What it contains | +# |--------|------------------| +# | `"answer"` | One chunk of the answer text, in the `"answer"` field | +# | `"source"` | The document chunks the agent used to answer | +# | `"thought"` | The agent's reasoning steps (if your agent has this enabled) | +# | `"id"` | The conversation ID, sent at the very end | +# | `"end"` | Signals that the stream is finished | +# | `"error"` | An error message | +# +# The first cell below reads the raw SSE stream line by line so you can see +# every event. The second uses `stream_docsgpt()` which handles the parsing +# and returns the assembled answer string. The third uses +# `stream_docsgpt_events()` which yields every event dict if you need to +# access sources or the conversation ID. +# + +# %% +# ── Raw SSE streaming — annotated to show every line ────────────────────── +stream_q = "What programming paradigms does Python support? Give a one-line example of each." 
+ +print(f"Request:") +print(f" POST {BASE_URL}/stream") +print(f" Accept: text/event-stream") +print(f" {{question: '{stream_q[:60]}...'}}") +print("\n⚡ Live SSE events:") +print("-" * 55) + +event_log: list = [] + +with requests.post( + f"{BASE_URL}/stream", + json={"question": stream_q, "api_key": API_KEY}, + stream=True, + timeout=120, + headers={"Accept": "text/event-stream"}, +) as resp: + resp.raise_for_status() + for raw in resp.iter_lines(): + if not raw: + continue + line = raw.decode("utf-8") if isinstance(raw, bytes) else raw + if not line.startswith("data:"): + continue + try: + event = json.loads(line[5:].strip()) + event_log.append(event) + etype = event.get("type", "?") + if etype == "answer": + print(event["answer"], end="", flush=True) # ← field is "answer" + elif etype == "end": + print("\n[END]") + break + elif etype == "source": + print(f"\n[SOURCE event — {len(event.get('sources',[]))} source(s)]") + elif etype == "id": + print(f"\n[ID event — conversation_id: {event.get('id', '')}]") + elif etype == "error": + print(f"\n[ERROR: {event.get('error', '')}]") + except json.JSONDecodeError: + pass + +print("-" * 55) +type_counts = {} +for e in event_log: + t = e.get("type", "?") + type_counts[t] = type_counts.get(t, 0) + 1 +print(f"\nEvent type breakdown: {type_counts}") + +# %% +# ── stream_docsgpt() wrapper ─────────────────────────────────────────────── +# Handles all the SSE parsing and returns the full assembled answer string. +print("Using stream_docsgpt() wrapper:") +print("-" * 55) + +answer = tdgputi.stream_docsgpt( + "What is the GIL in Python and why does it matter?", + API_KEY, + BASE_URL, + print_live=True, +) +print("-" * 55) +print(f"Total chars assembled: {len(answer)}") + +# %% +# ── stream_docsgpt_events() — get every event type, not just tokens ──────── +# Useful when you need the sources the RAG retrieved, or the conversation ID. 
+print("Using stream_docsgpt_events() to capture all event types:\n") + +answer_parts = [] +sources = [] +conv_id = "" + +for event in tdgputi.stream_docsgpt_events( + "Name three popular Python web frameworks.", + API_KEY, BASE_URL, +): + etype = event.get("type") + if etype == "answer": + answer_parts.append(event.get("answer", "")) + elif etype == "source": + sources = event.get("sources", []) + print(f"→ Received {len(sources)} source(s) from RAG retrieval") + elif etype == "id": + conv_id = event.get("id", "") + print(f"→ Conversation ID: {conv_id}") + elif etype == "end": + print(f"→ Stream ended") + +print(f"\nFull answer: {''.join(answer_parts)[:300]}") + +# %% [markdown] +# --- +# +# ## Endpoints 3 & 4: File Attachment Flow +# +# DocsGPT processes uploaded files in the background, so attaching a file +# to a query takes three separate steps: +# +# **Step 1 — Upload the file** (`POST /api/store_attachment`) +# Send the file as multipart form data. The server queues it for processing +# and immediately returns a `task_id`. +# +# **Step 2 — Wait for processing** (`GET /api/task_status?task_id=...`) +# Poll this endpoint every few seconds. When `status` becomes `"SUCCESS"`, +# the response includes an `attachment_id`. +# +# **Step 3 — Use the attachment** (`POST /stream`) +# Pass the `attachment_id` in the `"attachments"` list. The model can now +# read the file when generating its answer. +# +# ``` +# POST /api/store_attachment → { task_id: "abc" } +# ↓ +# GET /api/task_status → { status: "SUCCESS", attachment_id: "xyz" } +# ↓ +# POST /stream (attachments: ["xyz"]) → answer stream +# ``` +# +# The three cells below go through each step. `upload_and_attach()` combines +# steps 1 and 2 into a single function call. +# + +# %% +# ── Step 1: POST /api/store_attachment ──────────────────────────────────── +attach_content = ( + "Transformers: A Neural Network Architecture\n\n" + "Introduced in 'Attention is All You Need' (Vaswani et al. 
2017).\n" + "Key innovation: self-attention allows parallel processing of sequences.\n\n" + "Core components:\n" + " 1. Multi-head self-attention — attends to all positions simultaneously\n" + " 2. Positional encoding — injects sequence order without recurrence\n" + " 3. Feed-forward layers — applies non-linear transformations\n\n" + "Famous models: BERT (encoder-only), GPT (decoder-only), T5 (encoder-decoder)." +) + +with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", prefix="api_test_", + delete=False, encoding="utf-8", +) as tmp: + tmp.write(attach_content) + tmp_path = tmp.name + +print(f"📁 Temp file : {tmp_path}") +print(f" Content : {len(attach_content)} chars") +print() +print(f"Request:") +print(f" POST {BASE_URL}/api/store_attachment") +print(f" multipart/form-data: file=, api_key=") +print() + +with open(tmp_path, "rb") as fh: + upload_resp = requests.post( + f"{BASE_URL}/api/store_attachment", + files={"file": (os.path.basename(tmp_path), fh)}, + data={"api_key": API_KEY}, + timeout=60, + ) + +print(f"Status : {upload_resp.status_code}") +upload_data = upload_resp.json() +print(f"Response : {upload_data}") +task_id = upload_data.get("task_id", "") +print(f"task_id : {task_id}") + +# %% +# ── Step 2: GET /api/task_status — poll until SUCCESS ───────────────────── +print(f"Request:") +print(f" GET {BASE_URL}/api/task_status?task_id={task_id}") +print() + +attachment_id = "" + +if task_id: + # Show one raw poll to see the response shape + single_poll = requests.get( + f"{BASE_URL}/api/task_status", + params={"task_id": task_id}, + timeout=15, + ).json() + print(f"Single poll result: {single_poll}") + print() + + # Now use the wrapper which polls automatically until SUCCESS + print("Polling until SUCCESS...") + try: + attachment_id = tdgputi.poll_attachment_status( + task_id, BASE_URL, timeout_sec=90, poll_interval=3.0 + ) + print(f"✅ attachment_id: {attachment_id}") + except (TimeoutError, RuntimeError) as e: + print(f"⚠️ {e}") +else: + print("No 
task_id available — skipping poll step.") + +os.unlink(tmp_path) # clean up temp file + +# %% +# ── Step 3: POST /stream with attachments=[attachment_id] ───────────────── +if attachment_id: + print(f"Request:") + print(f" POST {BASE_URL}/stream") + print(f" {{question: '...', api_key: '...', attachments: ['{attachment_id[:12]}...']}}") + print() + print("⚡ Streaming response with attachment:\n" + "-"*55) + + answer = tdgputi.stream_docsgpt( + "What are the three core components described in this document? Explain each one.", + API_KEY, + BASE_URL, + attachments=[attachment_id], + print_live=True, + ) + print("-"*55) + print(f"✅ Answer: {len(answer)} chars") +else: + print("No attachment_id — skipping. (Try again if the upload failed above.)") + +# %% [markdown] +# --- +# +# ## Summarisation & FAQ — Sending Text in the Prompt +# +# Summarisation and FAQ generation do not use separate endpoints. +# Both work by embedding the document text directly inside the question +# and sending it to `/api/answer`. +# +# `summarize_document()` builds a prompt like this and sends it: +# ``` +# Read the following document and write a summary in under {max_words} words. +# +# DOCUMENT: +# {document_text} +# ``` +# +# `generate_faqs()` builds a prompt that asks for Q:/A: formatted pairs: +# ``` +# Generate exactly {n} FAQs with answers. Use this format: +# Q: {question} +# A: {answer} +# +# DOCUMENT: +# {document_text} +# ``` +# +# After DocsGPT responds, `parse_faqs()` splits the raw text on `Q:` markers +# and extracts each question-answer pair into a list of dicts. +# +# The text is auto-truncated to 4000 characters before sending to stay within +# the prompt size limit. +# + +# %% +# ── Inline summarisation and FAQ demo ───────────────────────────────────── +sample_doc = ( + "Python is a high-level, interpreted programming language known for its clear " + "syntax and readability. 
It supports multiple programming paradigms including " + "procedural, object-oriented, and functional programming. 
Python is widely used " + "in data science, machine learning, web development, and automation. It was " + "created by Guido van Rossum and first released in 1991. Python has a large " + "standard library and a vibrant open-source ecosystem. Its package manager pip " + "provides access to hundreds of thousands of packages on PyPI." +) + +# ── Summarisation ────────────────────────────────────────────────────────── +print("📝 SUMMARISATION via POST /api/answer (inline prompt)") +print("=" * 55) + +summary = tdgputi.summarize_document( + sample_doc, API_KEY, BASE_URL, max_words=80, source_label="api_walkthrough" +) +print(f"Summary ({len(summary.split())} words):\n{summary}") + +# ── FAQ Generation ───────────────────────────────────────────────────────── +print("\n❓ FAQ GENERATION via POST /api/answer (inline prompt)") +print("=" * 55) + +faqs = tdgputi.generate_faqs( + sample_doc, API_KEY, BASE_URL, n_questions=3, source_label="api_walkthrough" +) +print(f"Generated {len(faqs)} FAQs:") +tdgputi.print_faqs(faqs) + +# %% [markdown] +# ## Evaluation — ROUGE and BLEU +# +# We use two standard metrics to measure how well the generated summaries and +# FAQs capture the content of the original document. +# +# **ROUGE** (Recall-Oriented Understudy for Gisting Evaluation) counts how many +# words or phrases from the reference text appear in the generated output. +# We compute three variants: +# +# - **ROUGE-1** — counts single word matches. If the reference says +# "machine learning" and the summary says "machine learning", both words count. +# - **ROUGE-2** — counts two-word phrase matches. "machine learning" as a pair +# has to appear in both texts to score. +# - **ROUGE-L** — finds the longest sequence of words that appear in the same +# order in both texts, even if not consecutive. +# +# All three return an F1 score between 0 and 1. 
Scores between 0.2 and 0.4 are +# typical for AI-generated summaries — the model paraphrases rather than copying, +# so exact word matches will naturally be lower. +# +# **BLEU** (Bilingual Evaluation Understudy) works from the other direction — it +# checks how many n-grams (words and short phrases) in the generated text also appear in the reference, and +# applies a penalty if the output is too short. Like ROUGE, it ranges from 0 to 1, and higher is better. +# +# `evaluate_output(hypothesis, reference)` runs both metrics and returns a single +# dict with keys `rouge1`, `rouge2`, `rougeL`, and `bleu`. `evaluate_all()` runs +# it across every dataset, using the first 500 characters of each source text as +# the reference. + +# %% +# ── Evaluation ───────────────────────────────────────────────────────────── +print("\n📊 EVALUATION (ROUGE + BLEU)") +print("=" * 55) + +scores = tdgputi.evaluate_output(summary, sample_doc[:500]) +tdgputi.print_scores(scores, label="summary") + +# %% [markdown] +# --- +# +# ## Summary +# +# | Endpoint | Method | Wrapper function | +# |----------|--------|------------------| +# | `/api/answer` | POST | `query_docsgpt()` | +# | `/api/answer` + history | POST | `multi_turn_conversation()` | +# | `/stream` | POST | `stream_docsgpt()`, `stream_docsgpt_events()` | +# | `/api/store_attachment` | POST | `store_attachment()` | +# | `/api/task_status` | GET | `poll_attachment_status()` | +# +# **Things to remember:** +# - History is a JSON-encoded string, not a raw list +# - The history reply key is `"response"`, not `"answer"` +# - SSE answer chunks have `type == "answer"` and the text is in `event["answer"]` +# - File uploads are async — always poll until SUCCESS before using the attachment +# diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.ipynb b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.ipynb new file mode 100644 index 000000000..993d293fe --- /dev/null +++ 
b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.ipynb @@ -0,0 +1,1259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# DocsGPT: Intelligent Documentation Assistant\n", + "\n", + "This notebook builds a complete AI-powered documentation assistant step by step.\n", + "Each part focuses on one piece of the system and shows how the code works.\n", + "\n", + "| Part | What it does |\n", + "|------|--------------|\n", + "| 1 | Load text from three real datasets |\n", + "| 2 | Summarise each document using DocsGPT |\n", + "| 3 | Generate FAQs from each document |\n", + "| 4 | Score the outputs with ROUGE and BLEU |\n", + "| 5 | Produce summaries and FAQs in other languages |\n", + "| 6 | Show all results in a summary table |\n", + "| 7 | Launch an interactive Gradio UI |\n", + "\n", + "**Setup:**\n", + "```bash\n", + "./docker_build.sh\n", + "./docker_jupyter.sh\n", + "cp .env.example .env\n", + "# Open .env and set DOCSGPT_API_KEY=your-agent-key\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "setup", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base URL : https://gptcloud.arc53.com\n", + "API key : 5ffa3a62-f*** (truncated for safety)\n", + "Environment ready ✓\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import logging\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import docsgpt_utils as tdgputi\n", + "\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format=\"%(asctime)s [%(levelname)s] %(name)s: %(message)s\",\n", + ")\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "BASE_URL = tdgputi.get_base_url()\n", + "API_KEY = tdgputi.get_api_key()\n", + "\n", + "print(f\"Base URL : {BASE_URL}\")\n", + "print(f\"API key : {API_KEY[:10]}*** 
(truncated for safety)\")\n", + "print(\"Environment ready ✓\")" + ] + }, + { + "cell_type": "markdown", + "id": "part1-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 1: Data Collection\n", + "\n", + "We load text from three sources. Each one is handled by a function in\n", + "`docsgpt_utils.py` that fetches the data and returns plain text.\n", + "\n", + "- **`fetch_awesome_ml_readme()`** — downloads the raw markdown file from GitHub\n", + " using `requests.get()`. `parse_awesome_ml_sections()` then splits it into a\n", + " dict of `{section_title: body_text}` by scanning for `## ` heading lines.\n", + " `clean_markdown()` strips all markdown syntax (links, bullets, code blocks)\n", + " so we're left with plain prose the LLM can read cleanly.\n", + "\n", + "- **`load_stackoverflow_sample()`** — connects to HuggingFace Hub and streams\n", + " rows one at a time using `datasets.load_dataset(..., streaming=True)`. Only\n", + " the first `n_rows` rows are read — the full dataset is never downloaded.\n", + " `so_rows_to_text()` combines the title, body, and answer of each row into\n", + " one readable block of text.\n", + "\n", + "- **`load_pile_sample()`** — streams The Pile dataset the same way, collecting\n", + " rows until we have enough characters.\n", + "\n", + "All three functions fall back to built-in sample data if the network is\n", + "unavailable, so the notebook always keeps running.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "data-aml", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:20:16,138 [INFO] docsgpt_utils: fetching Awesome ML README from https://raw.githubusercontent.com/josephmisiti/awesome-machine-learning/master/README.md\n", + "2026-05-06 13:20:16,304 [INFO] docsgpt_utils: fetched 212555 chars\n", + "2026-05-06 13:20:16,306 [INFO] docsgpt_utils: parsed 41 sections from Awesome ML README\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "============================================================\n", + "1a. Fetching Awesome Machine Learning README from GitHub...\n", + " Total chars : 212,555\n", + " Sections found: 41\n", + " Section names : ['Preamble', 'IMPORTANT NOTE ON PRs:', 'Star History', 'Table of Contents', 'APL', 'C'] ...\n", + " Selected : 'Python'\n", + " Clean text : 17,302 chars\n", + " Preview : Computer Vision\n", + "LightlyTrain - Pretrain computer vision models on unlabeled data for industrial applications\n", + "Scikit-Image - A collection of algorithms for image processing in Python.\n", + "Scikit-Opt - Swar...\n" + ] + } + ], + "source": [ + "# ── 1a. Awesome Machine Learning (GitHub README) ───────────────────────────\n", + "# fetch_awesome_ml_readme() downloads the raw markdown from GitHub.\n", + "# parse_awesome_ml_sections() splits it into a dict: {section_title: body}\n", + "# clean_markdown() strips all markdown syntax → plain prose for the LLM.\n", + "\n", + "print(\"=\" * 60)\n", + "print(\"1a. 
Fetching Awesome Machine Learning README from GitHub...\")\n", + "\n", + "raw_md = tdgputi.fetch_awesome_ml_readme()\n", + "sections = tdgputi.parse_awesome_ml_sections(raw_md)\n", + "\n", + "print(f\" Total chars : {len(raw_md):,}\")\n", + "print(f\" Sections found: {len(sections)}\")\n", + "print(f\" Section names : {list(sections.keys())[:6]} ...\")\n", + "\n", + "# Pick the largest content-rich section to give the LLM something meaty\n", + "content_sections = {k: v for k, v in sections.items() if len(v) > 500}\n", + "aml_title = sorted(content_sections, key=lambda k: len(content_sections[k]), reverse=True)[0]\n", + "aml_text = tdgputi.clean_markdown(content_sections[aml_title])\n", + "\n", + "print(f\" Selected : '{aml_title}'\")\n", + "print(f\" Clean text : {len(aml_text):,} chars\")\n", + "print(f\" Preview : {aml_text[:200]}...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "data-so", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1b. Loading Stack Overflow questions from HuggingFace Hub...\n", + " (uses streaming — no full download needed)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kshitid/Kshiti/UMCP/DATA605/docsgpt/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2026-05-06 13:20:24,214 [INFO] docsgpt_utils: loading SO dataset (streaming, n=10)\n", + "Downloading readme: 2.85kB [00:00, 2.67MB/s]\n", + "2026-05-06 13:20:29,282 [INFO] docsgpt_utils: loaded 10 SO rows\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Rows loaded : 10\n", + " Combined chars: 17,201\n", + " First question: Parsing json directly using input stream\n", + " First answer : ...\n" + ] + } + ], + "source": [ + "# ── 1b. 
Stack Overflow Questions (HuggingFace Hub, streaming) ──────────────\n", + "# load_stackoverflow_sample() uses the HuggingFace `datasets` library with\n", + "# streaming=True so we never download the full dataset — just the first N rows.\n", + "#\n", + "# so_rows_to_text() combines title + body + answer into one readable document\n", + "# per question, then joins them all with --- separators.\n", + "\n", + "print(\"1b. Loading Stack Overflow questions from HuggingFace Hub...\")\n", + "print(\" (uses streaming — no full download needed)\")\n", + "\n", + "so_rows = tdgputi.load_stackoverflow_sample(n_rows=10)\n", + "so_text = tdgputi.so_rows_to_text(so_rows)\n", + "\n", + "print(f\"\\n Rows loaded : {len(so_rows)}\")\n", + "print(f\" Combined chars: {len(so_text):,}\")\n", + "print(f\" First question: {so_rows[0]['title']}\")\n", + "print(f\" First answer : {so_rows[0]['answer'][:100]}...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "data-pile", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:20:44,442 [INFO] docsgpt_utils: streaming Pile dataset (target 8000 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1c. Streaming a sample from The Pile (uncopyrighted subset)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading readme: 100%|██████████| 776/776 [00:00<00:00, 3.33MB/s]\n", + "2026-05-06 13:20:45,523 [WARNING] docsgpt_utils: Pile dataset unavailable: Compression type zstd not supported — using fallback\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Collected : 829 chars\n", + " Preview : Natural language processing (NLP) is a subfield of artificial intelligence that focuses on enabling computers to understand, interpret, and generate human language. 
Modern NLP relies on large transfor...\n", + "\n", + "📦 All datasets ready:\n", + " • Awesome ML: 17,302 chars\n", + " • Stack Overflow: 17,201 chars\n", + " • The Pile: 829 chars\n" + ] + } + ], + "source": [ + "# ── 1c. The Pile — uncopyrighted subset (HuggingFace Hub, streaming) ───────\n", + "# The Pile is a large-scale dataset of diverse text. We use the uncopyrighted\n", + "# subset for safety. Again we stream just enough characters rather than\n", + "# downloading the whole thing.\n", + "\n", + "print(\"1c. Streaming a sample from The Pile (uncopyrighted subset)...\")\n", + "\n", + "pile_text = tdgputi.load_pile_sample(n_chars=8000)\n", + "\n", + "print(f\" Collected : {len(pile_text):,} chars\")\n", + "print(f\" Preview : {pile_text[:200]}...\")\n", + "\n", + "# ── Package everything into a named dict for the rest of the notebook ───────\n", + "SOURCE_TEXTS = {\n", + " \"Awesome ML\": aml_text,\n", + " \"Stack Overflow\": so_text,\n", + " \"The Pile\": pile_text,\n", + "}\n", + "\n", + "print(\"\\n📦 All datasets ready:\")\n", + "for label, text in SOURCE_TEXTS.items():\n", + " print(f\" • {label}: {len(text):,} chars\")" + ] + }, + { + "cell_type": "markdown", + "id": "part2-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 2: Text Summarisation\n", + "\n", + "`summarize_document()` takes a block of text and sends it to DocsGPT to\n", + "get a summary. Here is what happens inside the function:\n", + "\n", + "1. `truncate_text()` cuts the text to 4000 characters so it fits in the prompt\n", + "2. The text is placed inside a prompt string that instructs DocsGPT to\n", + " summarise it in a certain number of words\n", + "3. That prompt is sent to `POST /api/answer` via `query_docsgpt()`\n", + "4. 
The `\"answer\"` field from the JSON response is returned as the summary\n", + "\n", + "The loop below runs this for each of the three datasets and stores the\n", + "results in a `summaries` dict keyed by dataset name.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "summarise", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:20:54,418 [INFO] docsgpt_utils: summarize_document: 'Awesome ML' (4012 chars)\n", + "2026-05-06 13:20:54,418 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "📄 Summarising: Awesome ML (17,302 chars → truncated to 4000)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:20:56,187 [INFO] docsgpt_utils: answer received (1473 chars)\n", + "2026-05-06 13:20:56,188 [INFO] docsgpt_utils: summarize_document: 'Stack Overflow' (4012 chars)\n", + "2026-05-06 13:20:56,189 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Summary (173 words):\n", + "This document outlines a diverse ecosystem of Python-based libraries and frameworks dedicated to **Computer Vision (CV)**, ranging from foundational image processing to advanced deep learning.\n", + "\n", + "Key categories include:\n", + "* **General Frameworks & Image Processing:** Tools like **Scikit-Image**, **SimpleCV**, and **imutils** provide essential algorithms and convenience functions. 
**Detecto** and **PyTorchCV** offer streamlined workflows for model training.\n", + "* **Face Recognition & Analysis:** High-performance libraries such as **deepface**, **face_recognition**, and **retinaface** enable facial detection, landmarking, and attribute analysis (age, gender, emotion).\n", + "* **Object Detection & Segmentation:** FAIR’s **Detectron2** and **albumentations** serve as industry standards for detection, segmentation, and robust data augmentation.\n", + "* **Generative AI & Style Transfer:** Resources like **TF-GAN**, **neural-style-pt**, and **joliGEN** support GANs, diffusion models, and artistic style transfer.\n", + "* **Specialized Applications:** The list covers **OCR** (pytesseract), **pose estimation** (Openpose), and **self-supervised learning** (Lightly). It also includes niche tools for swarm intelligence (**Scikit-Opt**) and energy-based models (**Learnergy**).\n", + "\n", + "While some projects are marked as deprecated (e.g., PCV, Detectron), the document highlights a strong shift toward **PyTorch** and **TensorFlow** ecosystems for modern industrial and research applications.\n", + "\n", + "============================================================\n", + "📄 Summarising: Stack Overflow (17,201 chars → truncated to 4000)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:20:58,556 [INFO] docsgpt_utils: answer received (1237 chars)\n", + "2026-05-06 13:20:58,557 [INFO] docsgpt_utils: summarize_document: 'The Pile' (829 chars)\n", + "2026-05-06 13:20:58,557 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Summary (178 words):\n", + "The user is seeking a memory-efficient method to parse JSON directly from an `InputStream` for Android applications targeting versions as low as 2.0. 
They aim to avoid loading the entire response into memory as a string, which is common with standard JSON objects.\n", + "\n", + "While the native `android.util.JsonReader` is limited to API 11+, the user attempts to use the **GSON library's `JsonReader`** as a compatible alternative. However, their implementation fails with a `java.io.EOFException`. \n", + "\n", + "**Key Technical Issue:**\n", + "The provided code demonstrates that the `InputStream` is being consumed twice. First, the user reads the entire stream into a `StringBuilder` via a `BufferedReader` (intended for a `TextView`). When they subsequently pass the same stream to GSON’s `JsonReader`, the stream has already reached the end, causing the parser to fail immediately.\n", + "\n", + "**Summary of Requirements:**\n", + "* **Target:** Android API 2.0+.\n", + "* **Goal:** Streaming JSON parsing to save memory.\n", + "* **Tool:** GSON (`JsonReader`).\n", + "* **Problem:** Improper `InputStream` management leading to `EOFException`.\n", + "\n", + "To resolve this, the user must choose between displaying the data or parsing it, or alternatively, parse the stream and update the UI incrementally.\n", + "\n", + "============================================================\n", + "📄 Summarising: The Pile (829 chars → truncated to 4000)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:21:00,999 [INFO] docsgpt_utils: answer received (1007 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Summary (134 words):\n", + "Natural Language Processing (NLP) is a branch of artificial intelligence focused on the interaction between computers and human language. 
Contemporary NLP is driven by large **transformer-based models**—such as BERT, GPT, and T5—which are pretrained on massive datasets and fine-tuned for specialized tasks.\n", + "\n", + "A pivotal development in the field was the **attention mechanism** (Vaswani et al., 2017), which allows models to weigh the importance of different words within a sequence. This, combined with **transfer learning**, has significantly reduced the amount of labeled data required for high-performance results.\n", + "\n", + "Key NLP tasks include:\n", + "* **Text classification** and **sentiment analysis**.\n", + "* **Named entity recognition (NER)** and **question answering**.\n", + "* **Machine translation** and **text summarization**.\n", + "\n", + "Today, NLP powers essential technologies such as search engines, virtual assistants, chatbots, and automated content moderation, making it a cornerstone of modern digital infrastructure.\n", + "\n", + "\n", + "🎉 All 3 summaries generated!\n" + ] + } + ], + "source": [ + "# ── Summarise each dataset source ──────────────────────────────────────────\n", + "# summarize_document() builds the prompt, calls POST /api/answer, and returns\n", + "# the 'answer' field from the JSON response.\n", + "\n", + "summaries: dict = {}\n", + "\n", + "for label, text in SOURCE_TEXTS.items():\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"📄 Summarising: {label} ({len(text):,} chars → truncated to 4000)\")\n", + "\n", + " truncated = tdgputi.truncate_text(text, max_chars=4000)\n", + "\n", + " summary = tdgputi.summarize_document(\n", + " truncated,\n", + " API_KEY,\n", + " BASE_URL,\n", + " max_words=200,\n", + " source_label=label,\n", + " )\n", + " summaries[label] = summary\n", + " word_count = len(summary.split())\n", + " print(f\"\\n✅ Summary ({word_count} words):\")\n", + " print(summary)\n", + "\n", + "print(f\"\\n\\n🎉 All {len(summaries)} summaries generated!\")" + ] + }, + { + "cell_type": "markdown", + "id": "part3-md", + "metadata": {}, + "source": 
[ + "---\n", + "\n", + "## Part 3: FAQ Generation\n", + "\n", + "`generate_faqs()` works the same way as summarisation — it embeds the\n", + "document text in a prompt and sends it to `POST /api/answer`. The prompt\n", + "instructs DocsGPT to format the output as:\n", + "\n", + "```\n", + "Q: <question>\n", + "A: <answer>\n", + "```\n", + "\n", + "Once the response comes back, `parse_faqs()` processes the raw text:\n", + "1. Splits the text on `Q:` markers using `re.split()`\n", + "2. For each block, extracts the question with a `re.search()` for `Q: ...`\n", + " and the answer with a search for `A: ...`\n", + "3. Returns a list of `{\"question\": ..., \"answer\": ...}` dicts\n", + "\n", + "`print_faqs()` then formats and prints each pair.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "faq-gen", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:21:15,974 [INFO] docsgpt_utils: generate_faqs: 4 Qs for 'Awesome ML'\n", + "2026-05-06 13:21:15,974 [INFO] docsgpt_utils: query_docsgpt | Based on the document below, generate exactly 4 frequently asked questions (FAQs) with detailed, hel\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "❓ Generating FAQs: Awesome ML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:21:17,892 [INFO] docsgpt_utils: answer received (2298 chars)\n", + "2026-05-06 13:21:17,894 [INFO] docsgpt_utils: generate_faqs: 4 Qs for 'Stack Overflow'\n", + "2026-05-06 13:21:17,895 [INFO] docsgpt_utils: query_docsgpt | Based on the document below, generate exactly 4 frequently asked questions (FAQs) with detailed, hel\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Generated 4 FAQs:\n", + "\n", + "Q1: What is the difference between Detectron and Detectron2 according to the document?\n", + "A1: Detectron is 
Facebook AI Research's (FAIR) original software system for object detection, which implements algorithms like Mask R-CNN and is powered by the Caffe2 framework; however, it is now labeled as deprecated. Detectron2 is the next-generation research platform from FAIR, serving as a ground-up rewrite of the original version. Unlike its predecessor, Detectron2 is powered by the PyTorch deep learning framework and supports advanced object detection and segmentation tasks.\n", + "\n", + "Q2: Which libraries are recommended for performing face recognition and facial attribute analysis?\n", + "A2: There are several libraries listed for these tasks. The \"face_recognition\" library is designed for recognizing and manipulating faces from Python or the command line. For more comprehensive analysis, \"deepface\" is a lightweight framework that handles recognition and facial attribute analysis—such as age, gender, emotion, and race—by utilizing models like VGG-Face, FaceNet, and ArcFace. Additionally, \"OpenFace\" provides open-source face recognition via deep neural networks, and \"retinaface\" serves as a cutting-edge facial detector that includes facial landmarks.\n", + "\n", + "Q3: What is Albumentations and what makes it significant for deep learning competitions?\n", + "A3: Albumentations is a fast and framework-agnostic image augmentation library that supports a diverse range of techniques for classification, segmentation, and detection. It is significant because it has been used by winning participants in numerous high-profile Deep Learning competitions, including those hosted on Kaggle and Topcoder, as well as competitions held as part of CVPR workshops. Its ability to work across different frameworks makes it a versatile tool for enhancing model performance through data variation.\n", + "\n", + "Q4: Does the document mention any tools for Optical Character Recognition (OCR)?\n", + "A4: Yes, the document mentions \"pytesseract\" (Python-tesseract) for this purpose. 
It is an optical character recognition tool for Python that can recognize and \"read\" text embedded within images. It functions as a Python wrapper for Google's Tesseract-OCR Engine, allowing developers to integrate text-extraction capabilities into their computer vision workflows.\n", + "\n", + "============================================================\n", + "❓ Generating FAQs: Stack Overflow\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:21:20,391 [INFO] docsgpt_utils: answer received (2050 chars)\n", + "2026-05-06 13:21:20,392 [INFO] docsgpt_utils: generate_faqs: 4 Qs for 'The Pile'\n", + "2026-05-06 13:21:20,392 [INFO] docsgpt_utils: query_docsgpt | Based on the document below, generate exactly 4 frequently asked questions (FAQs) with detailed, hel\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Generated 4 FAQs:\n", + "\n", + "Q1: Why should I parse JSON directly from an InputStream instead of converting it to a String first?\n", + "A1: Parsing JSON directly from an InputStream is much more memory-efficient, especially for large datasets. By using a streaming parser like JsonReader, you process the data token by token as it arrives rather than loading the entire JSON payload into a String in your device's RAM. This prevents OutOfMemoryErrors that often occur when handling large API responses on mobile devices.\n", + "\n", + "Q2: How can I implement JSON streaming on Android versions older than API level 11?\n", + "A2: While the native `android.util.JsonReader` requires API level 11 or higher, you can support older versions (Android 2.0 and up) by using the Google GSON library. 
By including the GSON dependency, you can use `com.google.gson.stream.JsonReader`, which provides the same streaming functionality and API as the native version but remains compatible with legacy Android releases.\n", + "\n", + "Q3: Why does my code throw a \"java.io.EOFException: End of input\" when using JsonReader?\n", + "A3: This error typically occurs because the InputStream has already been fully consumed before the JsonReader attempts to read it. In the provided example, the code reads the stream into a StringBuilder using a BufferedReader first. Since most InputStreams can only be read once, the \"pointer\" is at the end of the data when it reaches the JsonReader. To fix this, you must pass the InputStream directly to the JsonReader and avoid reading it with any other methods beforehand.\n", + "\n", + "Q4: What is the correct way to initialize and use GSON's JsonReader with an InputStream?\n", + "A4: You should wrap the InputStream in an InputStreamReader and pass that directly to the JsonReader. The correct implementation is: `JsonReader reader = new JsonReader(new InputStreamReader(content, \"UTF-8\"));`. Once initialized, you can use methods like `reader.beginObject()`, `reader.nextName()`, and `reader.nextLong()` to navigate the JSON structure. Always remember to handle the stream tokens sequentially and close the reader when finished.\n", + "\n", + "============================================================\n", + "❓ Generating FAQs: The Pile\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:21:22,402 [INFO] docsgpt_utils: answer received (1464 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Generated 4 FAQs:\n", + "\n", + "Q1: What is natural language processing (NLP) and what is its primary goal?\n", + "A1: Natural language processing (NLP) is a specialized subfield of artificial intelligence. 
Its primary goal is to enable computers to understand, interpret, and generate human language, allowing for more seamless interaction between humans and machines.\n", + "\n", + "Q2: Which models are commonly used in modern NLP, and how are they developed?\n", + "A2: Modern NLP relies on large transformer-based models such as BERT, GPT, and T5. These models are developed through a two-step process: first, they are pretrained on massive corpora of text data, and then they are fine-tuned to perform specific tasks such as text classification or sentiment analysis.\n", + "\n", + "Q3: What is the significance of the attention mechanism in NLP?\n", + "A3: The attention mechanism, which was introduced in the 2017 paper 'Attention is All You Need' by Vaswani et al., is a breakthrough that allows models to weigh the importance of different words across a sequence. This mechanism, combined with transfer learning, has dramatically reduced the amount of labeled data required to train models for downstream tasks.\n", + "\n", + "Q4: What are some common real-world applications of NLP technology?\n", + "A4: NLP technology is used to power a variety of everyday applications, including search engines, virtual assistants, and chatbots. 
It is also used by organizations for content moderation, automated document analysis, machine translation, and text summarization.\n", + "\n", + "\n", + "🎉 FAQ generation complete for all 3 sources!\n" + ] + } + ], + "source": [ + "# ── Generate FAQs for each dataset source ─────────────────────────────────\n", + "# generate_faqs() sends a structured prompt to /api/answer requesting\n", + "# exactly n_questions FAQs in Q:/A: format, then calls parse_faqs() on the result.\n", + "\n", + "all_faqs: dict = {}\n", + "\n", + "for label, text in SOURCE_TEXTS.items():\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"❓ Generating FAQs: {label}\")\n", + "\n", + " truncated = tdgputi.truncate_text(text, max_chars=4000)\n", + "\n", + " faqs = tdgputi.generate_faqs(\n", + " truncated,\n", + " API_KEY,\n", + " BASE_URL,\n", + " n_questions=4,\n", + " source_label=label,\n", + " )\n", + " all_faqs[label] = faqs\n", + "\n", + " print(f\"\\n✅ Generated {len(faqs)} FAQs:\")\n", + " tdgputi.print_faqs(faqs)\n", + "\n", + "print(f\"\\n\\n🎉 FAQ generation complete for all {len(all_faqs)} sources!\")" + ] + }, + { + "cell_type": "markdown", + "id": "part4-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 4: Evaluation — ROUGE and BLEU\n", + "\n", + "We score each generated summary and FAQ answer against the original source\n", + "text to measure how well the content was captured.\n", + "\n", + "**`rouge_scores(hypothesis, reference)`** uses the `rouge_score` library to\n", + "compute three variants of ROUGE:\n", + "- **ROUGE-1**: counts how many individual words overlap\n", + "- **ROUGE-2**: counts how many two-word pairs overlap\n", + "- **ROUGE-L**: finds the longest matching sequence of words in order\n", + "\n", + "All three are F1 scores (0 to 1). 
A higher number means more overlap\n", + "with the reference text.\n", + "\n", + "**`bleu_score(hypothesis, reference)`** uses NLTK to compute BLEU, which\n", + "measures how precisely the generated text matches n-grams in the reference.\n", + "It also penalises outputs that are too short.\n", + "\n", + "**`evaluate_all()`** runs both metrics for every dataset. It uses the first\n", + "500 characters of each source as the reference, scores the summary, then\n", + "scores the first FAQ answer, and collects everything into a results dict.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "evaluate", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:21:36,853 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 13:21:37,011 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.214, 'rouge2': 0.0415, 'rougeL': 0.0823, 'bleu': 0.0042}\n", + "2026-05-06 13:21:37,012 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 13:21:37,013 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.1, 'rouge2': 0.0, 'rougeL': 0.0571, 'bleu': 0.0039}\n", + "2026-05-06 13:21:37,013 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 13:21:37,016 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.2657, 'rouge2': 0.0493, 'rougeL': 0.1189, 'bleu': 0.0053}\n", + "2026-05-06 13:21:37,016 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 13:21:37,017 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.2561, 'rouge2': 0.0247, 'rougeL': 0.1341, 'bleu': 0.0112}\n", + "2026-05-06 13:21:37,017 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 13:21:37,019 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.5507, 'rouge2': 0.361, 'rougeL': 0.4638, 'bleu': 0.1065}\n", + "2026-05-06 13:21:37,019 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 13:21:37,020 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.419, 'rouge2': 0.3107, 'rougeL': 0.4, 'bleu': 0.1244}\n" + ] + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "📊 Running ROUGE + BLEU evaluation...\n", + "\n", + "\n", + "[Awesome ML]\n", + " Summary scores:\n", + " ROUGE-1 : 0.2140\n", + " ROUGE-2 : 0.0415\n", + " ROUGE-L : 0.0823\n", + " BLEU : 0.0042\n", + " FAQ answer scores (first FAQ):\n", + " ROUGE-1 : 0.1000\n", + " BLEU : 0.0039\n", + "\n", + "[Stack Overflow]\n", + " Summary scores:\n", + " ROUGE-1 : 0.2657\n", + " ROUGE-2 : 0.0493\n", + " ROUGE-L : 0.1189\n", + " BLEU : 0.0053\n", + " FAQ answer scores (first FAQ):\n", + " ROUGE-1 : 0.2561\n", + " BLEU : 0.0112\n", + "\n", + "[The Pile]\n", + " Summary scores:\n", + " ROUGE-1 : 0.5507\n", + " ROUGE-2 : 0.3610\n", + " ROUGE-L : 0.4638\n", + " BLEU : 0.1065\n", + " FAQ answer scores (first FAQ):\n", + " ROUGE-1 : 0.4190\n", + " BLEU : 0.1244\n" + ] + } + ], + "source": [ + "# ── Evaluate all summaries and FAQs ────────────────────────────────────────\n", + "# evaluate_all() runs evaluate_output() for each document label.\n", + "# evaluate_output() calls rouge_scores() and bleu_score() internally.\n", + "# The reference for each document is its first 500 chars.\n", + "\n", + "print(\"📊 Running ROUGE + BLEU evaluation...\\n\")\n", + "\n", + "eval_results = tdgputi.evaluate_all(summaries, all_faqs, SOURCE_TEXTS)\n", + "\n", + "for label, scores in eval_results.items():\n", + " print(f\"\\n[{label}]\")\n", + " print(f\" Summary scores:\")\n", + " print(f\" ROUGE-1 : {scores.get('rouge1', 0):.4f}\")\n", + " print(f\" ROUGE-2 : {scores.get('rouge2', 0):.4f}\")\n", + " print(f\" ROUGE-L : {scores.get('rougeL', 0):.4f}\")\n", + " print(f\" BLEU : {scores.get('bleu', 0):.4f}\")\n", + " if 'faq_rouge1' in scores:\n", + " print(f\" FAQ answer scores (first FAQ):\")\n", + " print(f\" ROUGE-1 : {scores.get('faq_rouge1', 0):.4f}\")\n", + " print(f\" BLEU : {scores.get('faq_bleu', 0):.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "part8-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 5: 
Multi-Language Support\n", + "\n", + "`summarize_multilang()` and `generate_faqs_multilang()` produce output in\n", + "any of the 9 supported languages by running translation before and after\n", + "the DocsGPT call.\n", + "\n", + "**`translate_text(text, source, target)`** uses the `deep-translator`\n", + "library which calls the Google Translate API. If the text is longer than\n", + "4500 characters (Google's limit per request), the function splits it on\n", + "sentence boundaries using `re.split()`, translates each batch separately,\n", + "then joins the results back together.\n", + "\n", + "The full pipeline for `summarize_multilang()` is:\n", + "1. If the source language is not English, translate the input text to English\n", + "2. Call `summarize_document()` to get an English summary from DocsGPT\n", + "3. If the output language is not English, translate the summary to the target\n", + "4. Return both the English and translated summaries in a dict\n", + "\n", + "`generate_faqs_multilang()` follows the same pipeline but translates each\n", + "FAQ question and answer individually.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "multilang", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:24:11,019 [INFO] docsgpt_utils: summarize_document: 'so_es' (2011 chars)\n", + "2026-05-06 13:24:11,019 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🌍 Supported languages:\n", + " en: English\n", + " es: Spanish\n", + " fr: French\n", + " de: German\n", + " zh: Chinese\n", + " pt: Portuguese\n", + " it: Italian\n", + " ja: Japanese\n", + " ar: Arabic\n", + "\n", + "=======================================================\n", + "🌐 Target: Spanish (es)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "2026-05-06 13:24:12,589 [INFO] docsgpt_utils: answer received (565 chars)\n", + "2026-05-06 13:24:12,650 [INFO] docsgpt_utils: translating 565 chars: en -> es\n", + "2026-05-06 13:24:13,717 [INFO] docsgpt_utils: summarize_document: 'so_fr' (2011 chars)\n", + "2026-05-06 13:24:13,718 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[English summary]\n", + "The user seeks a memory-efficient method to parse JSON directly from an **InputStream** on Android, specifically targeting compatibility with version 2.0 (API level 5) and above. While trying to avoid loading large strings into memory, they found that the native `JsonReader` requires API level 11. Their current implementation incorrectly consumes the stream into a `StringBuilder` before parsing, defeating the purpose of streaming. They are looking for a solution using **GSON** or a similar library that supports stream-based parsing for older Android versions.\n", + "\n", + "[Spanish translation]\n", + "El usuario busca un método eficiente en memoria para analizar JSON directamente desde un **InputStream** en Android, específicamente dirigido a la compatibilidad con la versión 2.0 (API nivel 5) y superiores. Mientras intentaban evitar cargar cadenas grandes en la memoria, descubrieron que el `JsonReader` nativo requiere el nivel de API 11. Su implementación actual consume incorrectamente la transmisión en un `StringBuilder` antes de analizarla, frustrando el propósito de la transmisión. 
Están buscando una solución que utilice **GSON** o una biblioteca similar que admita el análisis basado en secuencias para versiones anteriores de Android.\n", + "\n", + "=======================================================\n", + "🌐 Target: French (fr)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:24:15,147 [INFO] docsgpt_utils: answer received (565 chars)\n", + "2026-05-06 13:24:15,148 [INFO] docsgpt_utils: translating 565 chars: en -> fr\n", + "2026-05-06 13:24:15,293 [INFO] docsgpt_utils: summarize_document: 'so_de' (2011 chars)\n", + "2026-05-06 13:24:15,294 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[English summary]\n", + "The user seeks a memory-efficient method to parse JSON directly from an **InputStream** on Android, specifically targeting compatibility with version 2.0 (API level 5) and above. While trying to avoid loading large strings into memory, they found that the native `JsonReader` requires API level 11. Their current implementation incorrectly consumes the stream into a `StringBuilder` before parsing, defeating the purpose of streaming. They are looking for a solution using **GSON** or a similar library that supports stream-based parsing for older Android versions.\n", + "\n", + "[French translation]\n", + "L'utilisateur recherche une méthode économe en mémoire pour analyser JSON directement à partir d'un **InputStream** sur Android, en ciblant spécifiquement la compatibilité avec la version 2.0 (API niveau 5) et supérieure. Tout en essayant d'éviter de charger de grandes chaînes en mémoire, ils ont constaté que le « JsonReader » natif nécessite le niveau d'API 11. 
Leur implémentation actuelle consomme de manière incorrecte le flux dans un « StringBuilder » avant l'analyse, ce qui va à l'encontre de l'objectif du streaming. Ils recherchent une solution utilisant **GSON** ou une bibliothèque similaire prenant en charge l'analyse basée sur les flux pour les anciennes versions d'Android.\n", + "\n", + "=======================================================\n", + "🌐 Target: German (de)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 13:24:17,197 [INFO] docsgpt_utils: answer received (565 chars)\n", + "2026-05-06 13:24:17,198 [INFO] docsgpt_utils: translating 565 chars: en -> de\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[English summary]\n", + "The user seeks a memory-efficient method to parse JSON directly from an **InputStream** on Android, specifically targeting compatibility with version 2.0 (API level 5) and above. While trying to avoid loading large strings into memory, they found that the native `JsonReader` requires API level 11. Their current implementation incorrectly consumes the stream into a `StringBuilder` before parsing, defeating the purpose of streaming. They are looking for a solution using **GSON** or a similar library that supports stream-based parsing for older Android versions.\n", + "\n", + "[German translation]\n", + "Der Benutzer sucht nach einer speichereffizienten Methode, um JSON direkt aus einem **InputStream** auf Android zu analysieren, und strebt dabei insbesondere die Kompatibilität mit Version 2.0 (API-Level 5) und höher an. Beim Versuch, das Laden großer Strings in den Speicher zu vermeiden, stellten sie fest, dass der native „JsonReader“ API-Level 11 erfordert. Ihre aktuelle Implementierung verbraucht den Stream fälschlicherweise vor dem Parsen in einen „StringBuilder“, wodurch der Zweck des Streamings zunichte gemacht wird. 
Sie suchen nach einer Lösung mit **GSON** oder einer ähnlichen Bibliothek, die streambasiertes Parsing für ältere Android-Versionen unterstützt.\n", + "\n", + "\n", + "✅ Multi-language summaries complete!\n" + ] + } + ], + "source": [ + "# ── Multi-language summarisation demo ──────────────────────────────────────\n", + "print(\"🌍 Supported languages:\")\n", + "for code, name in tdgputi.list_supported_languages().items():\n", + " print(f\" {code}: {name}\")\n", + "\n", + "# Use the Stack Overflow text for this demo\n", + "demo_text = tdgputi.truncate_text(SOURCE_TEXTS[\"Stack Overflow\"], max_chars=2000)\n", + "target_langs = [\"es\", \"fr\", \"de\"]\n", + "\n", + "multilang_results = {}\n", + "\n", + "for lang_code in target_langs:\n", + " lang_name = tdgputi.SUPPORTED_LANGUAGES[lang_code]\n", + " print(f\"\\n{'='*55}\")\n", + " print(f\"🌐 Target: {lang_name} ({lang_code})\")\n", + "\n", + " result = tdgputi.summarize_multilang(\n", + " demo_text,\n", + " API_KEY,\n", + " source_lang=\"en\",\n", + " output_lang=lang_code,\n", + " base_url=BASE_URL,\n", + " max_words=100,\n", + " source_label=f\"so_{lang_code}\",\n", + " )\n", + " multilang_results[lang_code] = result\n", + "\n", + " print(f\"\\n[English summary]\")\n", + " print(result['english_summary'])\n", + " print(f\"\\n[{lang_name} translation]\")\n", + " print(result['translated_summary'])\n", + "\n", + "print(\"\\n\\n✅ Multi-language summaries complete!\")" + ] + }, + { + "cell_type": "markdown", + "id": "part9-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 6: Results Dashboard\n", + "\n", + "This cell collects all the outputs generated so far — summaries, FAQ counts,\n", + "and evaluation scores — and arranges them into a pandas DataFrame.\n", + "\n", + "Each row represents one dataset. 
The columns show the number of FAQs\n", + "generated, a preview of the summary, and the four metric scores.\n", + "`pd.set_option()` controls how wide the summary preview column is and how\n", + "many decimal places the scores display.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dashboard", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + " DocsGPT Documentation Assistant — Results Dashboard\n", + "======================================================================\n", + " # FAQs Summary preview ROUGE-1 ROUGE-2 ROUGE-L BLEU\n", + "Document \n", + "Awesome ML 4 This document outlines a diverse ecosystem of Python-based libraries and framewo... 0.2140 0.0415 0.0823 0.0042\n", + "Stack Overflow 4 The user is seeking a memory-efficient method to parse JSON directly from an `In... 0.2657 0.0493 0.1189 0.0053\n", + "The Pile 4 Natural Language Processing (NLP) is a branch of artificial intelligence focused... 
0.5507 0.3610 0.4638 0.1065\n", + "======================================================================\n", + "\n", + "🏆 Best ROUGE-1 : The Pile (0.5507)\n", + "🏆 Best BLEU : The Pile (0.1065)\n" + ] + } + ], + "source": [ + "# ── Results dashboard ──────────────────────────────────────────────────────\n", + "rows = []\n", + "for label in SOURCE_TEXTS:\n", + " scores = eval_results.get(label, {})\n", + " n_faqs = len(all_faqs.get(label, []))\n", + " summary = summaries.get(label, \"\")\n", + " rows.append({\n", + " \"Document\": label,\n", + " \"# FAQs\": n_faqs,\n", + " \"Summary preview\": (summary[:80] + \"...\") if len(summary) > 80 else summary,\n", + " \"ROUGE-1\": scores.get(\"rouge1\", 0.0),\n", + " \"ROUGE-2\": scores.get(\"rouge2\", 0.0),\n", + " \"ROUGE-L\": scores.get(\"rougeL\", 0.0),\n", + " \"BLEU\": scores.get(\"bleu\", 0.0),\n", + " })\n", + "\n", + "dashboard = pd.DataFrame(rows).set_index(\"Document\")\n", + "pd.set_option(\"display.max_colwidth\", 85)\n", + "pd.set_option(\"display.float_format\", \"{:.4f}\".format)\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" DocsGPT Documentation Assistant — Results Dashboard\")\n", + "print(\"=\" * 70)\n", + "print(dashboard.to_string())\n", + "print(\"=\" * 70)\n", + "\n", + "best_rouge = dashboard[\"ROUGE-1\"].idxmax()\n", + "best_bleu = dashboard[\"BLEU\"].idxmax()\n", + "print(f\"\\n🏆 Best ROUGE-1 : {best_rouge} ({dashboard.loc[best_rouge, 'ROUGE-1']:.4f})\")\n", + "print(f\"🏆 Best BLEU : {best_bleu} ({dashboard.loc[best_bleu, 'BLEU']:.4f})\")" + ] + }, + { + "cell_type": "markdown", + "id": "part10-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Part 7: Gradio User Interface\n", + "\n", + "This cell builds an interactive web UI using Gradio. When a user clicks\n", + "the Generate button, the `run_docsgpt_ui()` function runs:\n", + "\n", + "1. Truncates the input text to 3500 characters\n", + "2. Calls `summarize_multilang()` to get a summary in the selected language\n", + "3. 
Calls `generate_faqs_multilang()` to get FAQs in the selected language\n", + "4. Calls `evaluate_output()` to compute ROUGE and BLEU scores for the summary\n", + "5. Returns all three results to the UI components\n", + "\n", + "`gr.Blocks()` defines the layout — a text input, a language dropdown, a\n", + "slider for FAQ count, and three output areas. `submit_btn.click()` wires\n", + "the button to the function, specifying which inputs to read and which\n", + "outputs to update.\n", + "\n", + "Run the cell, then open **http://127.0.0.1:7860** in your browser.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "gradio-ui", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/lj/89tkjhqd6034t1cfsz5drsy80000gn/T/ipykernel_17203/3298890697.py:64: UserWarning: The parameters have been moved from the Blocks constructor to the launch() method in Gradio 6.0: theme. Please pass these parameters to launch() instead.\n", + " with gr.Blocks(title=\"DocsGPT Documentation Assistant\", theme=gr.themes.Soft()) as demo:\n", + "2026-05-06 19:27:42,318 [INFO] httpx: HTTP Request: HEAD https://huggingface.co/api/telemetry/https%3A/api.gradio.app/gradio-initiated-analytics \"HTTP/1.1 200 OK\"\n", + "2026-05-06 19:27:42,337 [INFO] httpx: HTTP Request: GET http://127.0.0.1:7860/gradio_api/startup-events \"HTTP/1.1 200 OK\"\n", + "2026-05-06 19:27:42,344 [INFO] httpx: HTTP Request: HEAD http://127.0.0.1:7860/ \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching Gradio app at http://127.0.0.1:7860 ...\n", + "* Running on local URL: http://127.0.0.1:7860\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 19:27:42,662 [INFO] httpx: HTTP Request: GET https://api.gradio.app/pkg-version \"HTTP/1.1 200 OK\"\n", + "2026-05-06 19:27:42,716 [INFO] httpx: HTTP Request: GET https://api.gradio.app/v3/tunnel-request \"HTTP/1.1 200 
OK\"\n", + "2026-05-06 19:27:42,990 [INFO] httpx: HTTP Request: GET https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_darwin_arm64 \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on public URL: https://a6a2d2994f3cbb2171.gradio.live\n", + "\n", + "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 19:27:44,691 [INFO] httpx: HTTP Request: HEAD https://a6a2d2994f3cbb2171.gradio.live \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-05-06 19:27:44,755 [INFO] httpx: HTTP Request: HEAD https://huggingface.co/api/telemetry/https%3A/api.gradio.app/gradio-launched-telemetry \"HTTP/1.1 200 OK\"\n", + "2026-05-06 21:57:57,878 [INFO] docsgpt_utils: summarize_document: 'ui' (514 chars)\n", + "2026-05-06 21:57:57,881 [INFO] docsgpt_utils: query_docsgpt | Read the following document carefully and write a concise, well-structured summary in no more than 2\n", + "2026-05-06 21:58:16,110 [INFO] docsgpt_utils: answer received (1076 chars)\n", + "2026-05-06 21:58:16,112 [INFO] docsgpt_utils: generate_faqs: 2 Qs for 'ui_faq'\n", + "2026-05-06 21:58:16,113 [INFO] docsgpt_utils: query_docsgpt | Based on the document below, generate exactly 2 frequently asked questions (FAQs) with detailed, hel\n", + "2026-05-06 21:58:17,914 [INFO] docsgpt_utils: answer received (915 chars)\n", + "2026-05-06 21:58:17,940 [INFO] absl: Using default tokenizer.\n", + "2026-05-06 21:58:18,113 [INFO] docsgpt_utils: evaluate_output: {'rouge1': 0.6117, 'rouge2': 0.402, 'rougeL': 0.4466, 'bleu': 0.2475}\n" + ] + } + ], + "source": [ + "# ── Gradio UI ──────────────────────────────────────────────────────────────\n", + "import gradio as gr\n", + "\n", + "\n", + "def run_docsgpt_ui(document_text: str, output_language: str, n_faqs: int) -> tuple:\n", + " \"\"\"Gradio handler: summarise + generate FAQs + evaluate + translate.\"\"\"\n", + " if not document_text.strip():\n", + " return \"⚠️ Please paste a document first.\", \"\", \"\"\n", + "\n", + " lang_map = {v: k for k, v in tdgputi.list_supported_languages().items()}\n", + " out_lang_code = lang_map.get(output_language, \"en\")\n", + "\n", + " try:\n", + " truncated = tdgputi.truncate_text(document_text, 
max_chars=3500)\n", + "\n", + " # ── Summarisation ──────────────────────────────────────────────────\n", + " result = tdgputi.summarize_multilang(\n", + " truncated, API_KEY, source_lang=\"en\",\n", + " output_lang=out_lang_code, base_url=BASE_URL, source_label=\"ui\",\n", + " )\n", + " english_summary = result[\"english_summary\"]\n", + " translated_summary = result[\"translated_summary\"]\n", + "\n", + " summary_md = (\n", + " f\"### Summary\\n{english_summary}\" if out_lang_code == \"en\"\n", + " else f\"### English Summary\\n{english_summary}\\n\\n### {output_language} Summary\\n{translated_summary}\"\n", + " )\n", + "\n", + " # ── FAQ Generation ─────────────────────────────────────────────────\n", + " faq_result = tdgputi.generate_faqs_multilang(\n", + " truncated, API_KEY, source_lang=\"en\",\n", + " output_lang=out_lang_code, n_questions=int(n_faqs),\n", + " base_url=BASE_URL, source_label=\"ui_faq\",\n", + " )\n", + " faq_lines = [\n", + " f\"**Q{i}: {faq['question']}**\\nA: {faq['answer']}\"\n", + " for i, faq in enumerate(faq_result[\"translated_faqs\"], 1)\n", + " ]\n", + " faqs_md = \"\\n\\n\".join(faq_lines) or \"No FAQs generated.\"\n", + "\n", + " # ── Evaluation ─────────────────────────────────────────────────────\n", + " scores = tdgputi.evaluate_output(english_summary, document_text[:500])\n", + " scores_text = \"\\n\".join(f\"{k.upper()}: {v:.4f}\" for k, v in scores.items())\n", + "\n", + " return summary_md, faqs_md, scores_text\n", + "\n", + " except Exception as exc:\n", + " return f\"❌ Error: {exc}\", \"\", \"\"\n", + "\n", + "\n", + "# ── Build the interface ────────────────────────────────────────────────────\n", + "lang_choices = list(tdgputi.list_supported_languages().values())\n", + "\n", + "EXAMPLE_TEXT = (\n", + " \"Python is a high-level, interpreted programming language known for its \"\n", + " \"clear syntax and readability. 
It supports multiple programming paradigms \"\n", + " \"including procedural, object-oriented, and functional programming. Python \"\n", + " \"is widely used in data science, machine learning, web development, and \"\n", + " \"automation. It was created by Guido van Rossum and first released in 1991. \"\n", + " \"Python has a large standard library and a vibrant open-source ecosystem. \"\n", + " \"Its package manager pip provides access to hundreds of thousands of packages.\"\n", + ")\n", + "\n", + "with gr.Blocks(title=\"DocsGPT Documentation Assistant\", theme=gr.themes.Soft()) as demo:\n", + " gr.Markdown(\n", + " \"# 📄 DocsGPT Documentation Assistant\\n\"\n", + " \"Paste any technical document to get an AI-powered **summary** and **FAQs** \"\n", + " \"in your chosen language — powered by **DocsGPT Cloud** (`POST /api/answer`).\"\n", + " )\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=3):\n", + " doc_input = gr.Textbox(\n", + " label=\"📋 Document Text\",\n", + " placeholder=\"Paste your technical document here...\",\n", + " lines=12, value=EXAMPLE_TEXT,\n", + " )\n", + " with gr.Column(scale=1):\n", + " lang_dropdown = gr.Dropdown(\n", + " choices=lang_choices, value=\"English\", label=\"🌍 Output Language\",\n", + " )\n", + " n_faqs_slider = gr.Slider(\n", + " minimum=1, maximum=8, value=4, step=1, label=\"❓ Number of FAQs\",\n", + " )\n", + " submit_btn = gr.Button(\"🚀 Generate Summary + FAQs\", variant=\"primary\", size=\"lg\")\n", + "\n", + " with gr.Row():\n", + " summary_out = gr.Markdown(label=\"Summary\")\n", + "\n", + " with gr.Row():\n", + " faqs_out = gr.Markdown(label=\"FAQs\")\n", + "\n", + " scores_out = gr.Textbox(\n", + " label=\"📊 Evaluation Scores (ROUGE + BLEU)\", lines=5, interactive=False,\n", + " )\n", + "\n", + " submit_btn.click(\n", + " fn=run_docsgpt_ui,\n", + " inputs=[doc_input, lang_dropdown, n_faqs_slider],\n", + " outputs=[summary_out, faqs_out, scores_out],\n", + " )\n", + "\n", + "print(\"Launching Gradio app at 
http://127.0.0.1:7860 ...\")\n", + "demo.launch(share=True)\n", + "# Set share=True to get a public URL for demos" + ] + }, + { + "cell_type": "markdown", + "id": "conclusion", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## What We Built\n", + "\n", + "| Part | Function used | API call |\n", + "|------|---------------|----------|\n", + "| Data Collection | `fetch_awesome_ml_readme()`, `load_stackoverflow_sample()`, `load_pile_sample()` | GitHub, HuggingFace Hub |\n", + "| Summarisation | `summarize_document()` | `POST /api/answer` |\n", + "| FAQ Generation | `generate_faqs()`, `parse_faqs()` | `POST /api/answer` |\n", + "| Evaluation | `evaluate_all()` | rouge_score, nltk |\n", + "| Multi-Language | `summarize_multilang()`, `generate_faqs_multilang()` | deep-translator + DocsGPT |\n", + "| UI | `gr.Blocks()`, `demo.launch()` | Gradio |\n" + ] + }, + { + "cell_type": "markdown", + "id": "a2adda35", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.13.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.py new file mode 100644 index 000000000..1a7bd4492 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt.example.py @@ -0,0 +1,555 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.4 +# kernelspec: +# display_name: .venv (3.13.7) +# language: python 
#     name: python3
# ---

# %% [markdown]
# # DocsGPT: Intelligent Documentation Assistant
#
# This notebook builds a complete AI-powered documentation assistant step by step.
# Each part focuses on one piece of the system and shows how the code works.
#
# | Part | What it does |
# |------|--------------|
# | 1 | Load text from three real datasets |
# | 2 | Summarise each document using DocsGPT |
# | 3 | Generate FAQs from each document |
# | 4 | Score the outputs with ROUGE and BLEU |
# | 5 | Produce summaries and FAQs in other languages |
# | 6 | Show all results in a summary table |
# | 7 | Launch an interactive Gradio UI |
#
# **Setup:**
# ```bash
# ./docker_build.sh
# ./docker_jupyter.sh
# cp .env.example .env
# # Open .env and set DOCSGPT_API_KEY=your-agent-key
# ```
#

# %%
# %load_ext autoreload
# %autoreload 2

import logging
import os
import pandas as pd

import docsgpt_utils as tdgputi

from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment
# before the base URL and API key are looked up below.
load_dotenv()

# One shared log format for this notebook and the helper module.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
_LOG = logging.getLogger(__name__)

BASE_URL = tdgputi.get_base_url()
API_KEY = tdgputi.get_api_key()

# Cell output is stored in the .ipynb when the notebook is saved, so never
# echo a long prefix of the secret. Show at most its last four characters,
# and nothing at all for very short keys.
if not API_KEY:
    _key_display = "<not set>"
elif len(API_KEY) > 8:
    _key_display = f"***{API_KEY[-4:]}"
else:
    _key_display = "***"

print(f"Base URL : {BASE_URL}")
print(f"API key : {_key_display} (masked for safety)")
print("Environment ready ✓")

# %% [markdown]
# ---
#
# ## Part 1: Data Collection
#
# We load text from three sources. Each one is handled by a function in
# `docsgpt_utils.py` that fetches the data and returns plain text.
#
# - **`fetch_awesome_ml_readme()`** — downloads the raw markdown file from GitHub
#   using `requests.get()`. `parse_awesome_ml_sections()` then splits it into a
#   dict of `{section_title: body_text}` by scanning for `## ` heading lines.
#   `clean_markdown()` strips all markdown syntax (links, bullets, code blocks)
#   so we're left with plain prose the LLM can read cleanly.
#
# - **`load_stackoverflow_sample()`** — connects to HuggingFace Hub and streams
#   rows one at a time using `datasets.load_dataset(..., streaming=True)`. Only
#   the first `n_rows` rows are read — the full dataset is never downloaded.
#   `so_rows_to_text()` combines the title, body, and answer of each row into
#   one readable block of text.
#
# - **`load_pile_sample()`** — streams The Pile dataset the same way, collecting
#   rows until we have enough characters.
#
# All three functions fall back to built-in sample data if the network is
# unavailable, so the notebook always keeps running.
#

# %%
# ── 1a. Awesome Machine Learning (GitHub README) ───────────────────────────
# fetch_awesome_ml_readme() downloads the raw markdown from GitHub.
# parse_awesome_ml_sections() splits it into a dict: {section_title: body}
# clean_markdown() strips all markdown syntax → plain prose for the LLM.

print("=" * 60)
print("1a. Fetching Awesome Machine Learning README from GitHub...")

raw_md = tdgputi.fetch_awesome_ml_readme()
sections = tdgputi.parse_awesome_ml_sections(raw_md)

print(f" Total chars : {len(raw_md):,}")
print(f" Sections found: {len(sections)}")
print(f" Section names : {list(sections.keys())[:6]} ...")

# Pick the largest content-rich section to give the LLM something meaty.
# Prefer sections longer than 500 chars; if none qualifies, fall back to every
# parsed section instead of crashing on an empty candidate set.
# NOTE(review): presumably the offline fallback sample can be this short —
# TODO confirm against docsgpt_utils.
content_sections = {k: v for k, v in sections.items() if len(v) > 500} or dict(sections)
if not content_sections:
    raise RuntimeError("No sections could be parsed from the Awesome ML README")

# max() returns the first longest section, which is the same choice the
# previous descending stable sort made, without sorting the whole dict.
aml_title = max(content_sections, key=lambda k: len(content_sections[k]))
aml_text = tdgputi.clean_markdown(content_sections[aml_title])

print(f" Selected : '{aml_title}'")
print(f" Clean text : {len(aml_text):,} chars")
print(f" Preview : {aml_text[:200]}...")

# %%
# ── 1b. Stack Overflow Questions (HuggingFace Hub, streaming) ──────────────
# load_stackoverflow_sample() uses the HuggingFace `datasets` library with
# streaming=True so we never download the full dataset — just the first N rows.
#
# so_rows_to_text() combines title + body + answer into one readable document
# per question, then joins them all with --- separators.

print("1b. Loading Stack Overflow questions from HuggingFace Hub...")
print(" (uses streaming — no full download needed)")

so_rows = tdgputi.load_stackoverflow_sample(n_rows=10)
so_text = tdgputi.so_rows_to_text(so_rows)

# Report the sample size, then peek at the first row's title and answer.
print(f"\n Rows loaded : {len(so_rows)}")
print(f" Combined chars: {len(so_text):,}")
_first_row = so_rows[0]
print(f" First question: {_first_row['title']}")
print(f" First answer : {_first_row['answer'][:100]}...")

# %%
# ── 1c. The Pile — uncopyrighted subset (HuggingFace Hub, streaming) ───────
# The Pile is a large-scale dataset of diverse text. We use the uncopyrighted
# subset for safety. Again we stream just enough characters rather than
# downloading the whole thing.

print("1c. Streaming a sample from The Pile (uncopyrighted subset)...")

pile_text = tdgputi.load_pile_sample(n_chars=8000)

print(f" Collected : {len(pile_text):,} chars")
print(f" Preview : {pile_text[:200]}...")

# ── Package everything into a named dict for the rest of the notebook ───────
# Keys are the display labels reused by every later cell.
SOURCE_TEXTS = {
    "Awesome ML": aml_text,
    "Stack Overflow": so_text,
    "The Pile": pile_text,
}

print("\n📦 All datasets ready:")
for label, text in SOURCE_TEXTS.items():
    n_chars = len(text)
    print(f" • {label}: {n_chars:,} chars")

# %% [markdown]
# ---
#
# ## Part 2: Text Summarisation
#
# `summarize_document()` takes a block of text and sends it to DocsGPT to
# get a summary. Here is what happens inside the function:
#
# 1. `truncate_text()` cuts the text to 4000 characters so it fits in the prompt
# 2. The text is placed inside a prompt string that instructs DocsGPT to
#    summarise it in a certain number of words
# 3. That prompt is sent to `POST /api/answer` via `query_docsgpt()`
# 4. The `"answer"` field from the JSON response is returned as the summary
#
# The loop below runs this for each of the three datasets and stores the
# results in a `summaries` dict keyed by dataset name.
#

# %%
# ── Summarise each dataset source ──────────────────────────────────────────
# summarize_document() builds the prompt, calls POST /api/answer, and returns
# the 'answer' field from the JSON response.

summaries: dict = {}
_rule = "=" * 60  # separator printed before each dataset

for label, text in SOURCE_TEXTS.items():
    print("\n" + _rule)
    print(f"📄 Summarising: {label} ({len(text):,} chars → truncated to 4000)")

    truncated = tdgputi.truncate_text(text, max_chars=4000)

    # Keep the summary both under its label and in a local for the report.
    summary = summaries[label] = tdgputi.summarize_document(
        truncated, API_KEY, BASE_URL, max_words=200, source_label=label,
    )
    word_count = len(summary.split())
    print(f"\n✅ Summary ({word_count} words):")
    print(summary)

print(f"\n\n🎉 All {len(summaries)} summaries generated!")

# %% [markdown]
# ---
#
# ## Part 3: FAQ Generation
#
# `generate_faqs()` works the same way as summarisation — it embeds the
# document text in a prompt and sends it to `POST /api/answer`. The prompt
# instructs DocsGPT to format the output as:
#
# ```
# Q: <question>
# A: <answer>
# ```
#
# Once the response comes back, `parse_faqs()` processes the raw text:
# 1. Splits the text on `Q:` markers using `re.split()`
# 2. For each block, extracts the question with a `re.search()` for `Q: ...`
#    and the answer with a search for `A: ...`
# 3. Returns a list of `{"question": ..., "answer": ...}` dicts
#
# `print_faqs()` then formats and prints each pair.
#

# %%
# ── Generate FAQs for each dataset source ─────────────────────────────────
# generate_faqs() sends a structured prompt to /api/answer requesting
# exactly n_questions FAQs in Q:/A: format, then calls parse_faqs() on the result.
+ +all_faqs: dict = {} + +for label, text in SOURCE_TEXTS.items(): + print(f"\n{'='*60}") + print(f"❓ Generating FAQs: {label}") + + truncated = tdgputi.truncate_text(text, max_chars=4000) + + faqs = tdgputi.generate_faqs( + truncated, + API_KEY, + BASE_URL, + n_questions=4, + source_label=label, + ) + all_faqs[label] = faqs + + print(f"\n✅ Generated {len(faqs)} FAQs:") + tdgputi.print_faqs(faqs) + +print(f"\n\n🎉 FAQ generation complete for all {len(all_faqs)} sources!") + +# %% [markdown] +# --- +# +# ## Part 4: Evaluation — ROUGE and BLEU +# +# We score each generated summary and FAQ answer against the original source +# text to measure how well the content was captured. +# +# **`rouge_scores(hypothesis, reference)`** uses the `rouge_score` library to +# compute three variants of ROUGE: +# - **ROUGE-1**: counts how many individual words overlap +# - **ROUGE-2**: counts how many two-word pairs overlap +# - **ROUGE-L**: finds the longest matching sequence of words in order +# +# All three are F1 scores (0 to 1). A higher number means more overlap +# with the reference text. +# +# **`bleu_score(hypothesis, reference)`** uses NLTK to compute BLEU, which +# measures how precisely the generated text matches n-grams in the reference. +# It also penalises outputs that are too short. +# +# **`evaluate_all()`** runs both metrics for every dataset. It uses the first +# 500 characters of each source as the reference, scores the summary, then +# scores the first FAQ answer, and collects everything into a results dict. +# + +# %% +# ── Evaluate all summaries and FAQs ──────────────────────────────────────── +# evaluate_all() runs evaluate_output() for each document label. +# evaluate_output() calls rouge_scores() and bleu_score() internally. +# The reference for each document is its first 500 chars. 
+ +print("📊 Running ROUGE + BLEU evaluation...\n") + +eval_results = tdgputi.evaluate_all(summaries, all_faqs, SOURCE_TEXTS) + +for label, scores in eval_results.items(): + print(f"\n[{label}]") + print(f" Summary scores:") + print(f" ROUGE-1 : {scores.get('rouge1', 0):.4f}") + print(f" ROUGE-2 : {scores.get('rouge2', 0):.4f}") + print(f" ROUGE-L : {scores.get('rougeL', 0):.4f}") + print(f" BLEU : {scores.get('bleu', 0):.4f}") + if 'faq_rouge1' in scores: + print(f" FAQ answer scores (first FAQ):") + print(f" ROUGE-1 : {scores.get('faq_rouge1', 0):.4f}") + print(f" BLEU : {scores.get('faq_bleu', 0):.4f}") + +# %% [markdown] +# --- +# +# ## Part 5: Multi-Language Support +# +# `summarize_multilang()` and `generate_faqs_multilang()` produce output in +# any of the 9 supported languages by running translation before and after +# the DocsGPT call. +# +# **`translate_text(text, source, target)`** uses the `deep-translator` +# library which calls the Google Translate API. If the text is longer than +# 4500 characters (Google's limit per request), the function splits it on +# sentence boundaries using `re.split()`, translates each batch separately, +# then joins the results back together. +# +# The full pipeline for `summarize_multilang()` is: +# 1. If the source language is not English, translate the input text to English +# 2. Call `summarize_document()` to get an English summary from DocsGPT +# 3. If the output language is not English, translate the summary to the target +# 4. Return both the English and translated summaries in a dict +# +# `generate_faqs_multilang()` follows the same pipeline but translates each +# FAQ question and answer individually. 
+# + +# %% +# ── Multi-language summarisation demo ────────────────────────────────────── +print("🌍 Supported languages:") +for code, name in tdgputi.list_supported_languages().items(): + print(f" {code}: {name}") + +# Use the Stack Overflow text for this demo +demo_text = tdgputi.truncate_text(SOURCE_TEXTS["Stack Overflow"], max_chars=2000) +target_langs = ["es", "fr", "de"] + +multilang_results = {} + +for lang_code in target_langs: + lang_name = tdgputi.SUPPORTED_LANGUAGES[lang_code] + print(f"\n{'='*55}") + print(f"🌐 Target: {lang_name} ({lang_code})") + + result = tdgputi.summarize_multilang( + demo_text, + API_KEY, + source_lang="en", + output_lang=lang_code, + base_url=BASE_URL, + max_words=100, + source_label=f"so_{lang_code}", + ) + multilang_results[lang_code] = result + + print(f"\n[English summary]") + print(result['english_summary']) + print(f"\n[{lang_name} translation]") + print(result['translated_summary']) + +print("\n\n✅ Multi-language summaries complete!") + +# %% [markdown] +# --- +# +# ## Part 6: Results Dashboard +# +# This cell collects all the outputs generated so far — summaries, FAQ counts, +# and evaluation scores — and arranges them into a pandas DataFrame. +# +# Each row represents one dataset. The columns show the number of FAQs +# generated, a preview of the summary, and the four metric scores. +# `pd.set_option()` controls how wide the summary preview column is and how +# many decimal places the scores display. 
+# + +# %% +# ── Results dashboard ────────────────────────────────────────────────────── +rows = [] +for label in SOURCE_TEXTS: + scores = eval_results.get(label, {}) + n_faqs = len(all_faqs.get(label, [])) + summary = summaries.get(label, "") + rows.append({ + "Document": label, + "# FAQs": n_faqs, + "Summary preview": (summary[:80] + "...") if len(summary) > 80 else summary, + "ROUGE-1": scores.get("rouge1", 0.0), + "ROUGE-2": scores.get("rouge2", 0.0), + "ROUGE-L": scores.get("rougeL", 0.0), + "BLEU": scores.get("bleu", 0.0), + }) + +dashboard = pd.DataFrame(rows).set_index("Document") +pd.set_option("display.max_colwidth", 85) +pd.set_option("display.float_format", "{:.4f}".format) + +print("=" * 70) +print(" DocsGPT Documentation Assistant — Results Dashboard") +print("=" * 70) +print(dashboard.to_string()) +print("=" * 70) + +best_rouge = dashboard["ROUGE-1"].idxmax() +best_bleu = dashboard["BLEU"].idxmax() +print(f"\n🏆 Best ROUGE-1 : {best_rouge} ({dashboard.loc[best_rouge, 'ROUGE-1']:.4f})") +print(f"🏆 Best BLEU : {best_bleu} ({dashboard.loc[best_bleu, 'BLEU']:.4f})") + +# %% [markdown] +# --- +# +# ## Part 7: Gradio User Interface +# +# This cell builds an interactive web UI using Gradio. When a user clicks +# the Generate button, the `run_docsgpt_ui()` function runs: +# +# 1. Truncates the input text to 3500 characters +# 2. Calls `summarize_multilang()` to get a summary in the selected language +# 3. Calls `generate_faqs_multilang()` to get FAQs in the selected language +# 4. Calls `evaluate_output()` to compute ROUGE and BLEU scores for the summary +# 5. Returns all three results to the UI components +# +# `gr.Blocks()` defines the layout — a text input, a language dropdown, a +# slider for FAQ count, and three output areas. `submit_btn.click()` wires +# the button to the function, specifying which inputs to read and which +# outputs to update. +# +# Run the cell, then open **http://127.0.0.1:7860** in your browser. 
+# + +# %% +# ── Gradio UI ────────────────────────────────────────────────────────────── +import gradio as gr + + +def run_docsgpt_ui(document_text: str, output_language: str, n_faqs: int) -> tuple: + """Gradio handler: summarise + generate FAQs + evaluate + translate.""" + if not document_text.strip(): + return "⚠️ Please paste a document first.", "", "" + + lang_map = {v: k for k, v in tdgputi.list_supported_languages().items()} + out_lang_code = lang_map.get(output_language, "en") + + try: + truncated = tdgputi.truncate_text(document_text, max_chars=3500) + + # ── Summarisation ────────────────────────────────────────────────── + result = tdgputi.summarize_multilang( + truncated, API_KEY, source_lang="en", + output_lang=out_lang_code, base_url=BASE_URL, source_label="ui", + ) + english_summary = result["english_summary"] + translated_summary = result["translated_summary"] + + summary_md = ( + f"### Summary\n{english_summary}" if out_lang_code == "en" + else f"### English Summary\n{english_summary}\n\n### {output_language} Summary\n{translated_summary}" + ) + + # ── FAQ Generation ───────────────────────────────────────────────── + faq_result = tdgputi.generate_faqs_multilang( + truncated, API_KEY, source_lang="en", + output_lang=out_lang_code, n_questions=int(n_faqs), + base_url=BASE_URL, source_label="ui_faq", + ) + faq_lines = [ + f"**Q{i}: {faq['question']}**\nA: {faq['answer']}" + for i, faq in enumerate(faq_result["translated_faqs"], 1) + ] + faqs_md = "\n\n".join(faq_lines) or "No FAQs generated." 
+ + # ── Evaluation ───────────────────────────────────────────────────── + scores = tdgputi.evaluate_output(english_summary, document_text[:500]) + scores_text = "\n".join(f"{k.upper()}: {v:.4f}" for k, v in scores.items()) + + return summary_md, faqs_md, scores_text + + except Exception as exc: + return f"❌ Error: {exc}", "", "" + + +# ── Build the interface ──────────────────────────────────────────────────── +lang_choices = list(tdgputi.list_supported_languages().values()) + +EXAMPLE_TEXT = ( + "Python is a high-level, interpreted programming language known for its " + "clear syntax and readability. It supports multiple programming paradigms " + "including procedural, object-oriented, and functional programming. Python " + "is widely used in data science, machine learning, web development, and " + "automation. It was created by Guido van Rossum and first released in 1991. " + "Python has a large standard library and a vibrant open-source ecosystem. " + "Its package manager pip provides access to hundreds of thousands of packages." +) + +with gr.Blocks(title="DocsGPT Documentation Assistant", theme=gr.themes.Soft()) as demo: + gr.Markdown( + "# 📄 DocsGPT Documentation Assistant\n" + "Paste any technical document to get an AI-powered **summary** and **FAQs** " + "in your chosen language — powered by **DocsGPT Cloud** (`POST /api/answer`)." 
+ ) + + with gr.Row(): + with gr.Column(scale=3): + doc_input = gr.Textbox( + label="📋 Document Text", + placeholder="Paste your technical document here...", + lines=12, value=EXAMPLE_TEXT, + ) + with gr.Column(scale=1): + lang_dropdown = gr.Dropdown( + choices=lang_choices, value="English", label="🌍 Output Language", + ) + n_faqs_slider = gr.Slider( + minimum=1, maximum=8, value=4, step=1, label="❓ Number of FAQs", + ) + submit_btn = gr.Button("🚀 Generate Summary + FAQs", variant="primary", size="lg") + + with gr.Row(): + summary_out = gr.Markdown(label="Summary") + + with gr.Row(): + faqs_out = gr.Markdown(label="FAQs") + + scores_out = gr.Textbox( + label="📊 Evaluation Scores (ROUGE + BLEU)", lines=5, interactive=False, + ) + + submit_btn.click( + fn=run_docsgpt_ui, + inputs=[doc_input, lang_dropdown, n_faqs_slider], + outputs=[summary_out, faqs_out, scores_out], + ) + +print("Launching Gradio app at http://127.0.0.1:7860 ...") +demo.launch(share=True) +# Set share=True to get a public URL for demos + +# %% [markdown] +# --- +# +# ## What We Built +# +# | Part | Function used | API call | +# |------|---------------|----------| +# | Data Collection | `fetch_awesome_ml_readme()`, `load_stackoverflow_sample()`, `load_pile_sample()` | GitHub, HuggingFace Hub | +# | Summarisation | `summarize_document()` | `POST /api/answer` | +# | FAQ Generation | `generate_faqs()`, `parse_faqs()` | `POST /api/answer` | +# | Evaluation | `evaluate_all()` | rouge_score, nltk | +# | Multi-Language | `summarize_multilang()`, `generate_faqs_multilang()` | deep-translator + DocsGPT | +# | UI | `gr.Blocks()`, `demo.launch()` | Gradio | +# + +# %% [markdown] +# diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt_utils.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt_utils.py new file mode 100644 index 000000000..6c3ffbc6a --- /dev/null +++ 
b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/docsgpt_utils.py @@ -0,0 +1,1016 @@ +""" +docsgpt_utils.py +================ +Utility functions for DocsGPT-based documentation assistant workflows. + +DocsGPT is an AI-powered RAG platform for querying and generating content +from technical documents via a REST API on https://gptcloud.arc53.com. + +Verified against: https://docs.docsgpt.cloud/Agents/api (May 2026) + +API endpoints: + POST /api/answer - non-streaming Q&A (full JSON) + POST /stream - SSE streaming Q&A (token-by-token) + POST /api/store_attachment - upload a file attachment (multipart) + GET /api/task_status - poll async attachment processing + +Key design decisions: + - With an Agent API key, document sources are pre-configured in the UI. + Summarisation and FAQ generation embed text INLINE in the prompt so the + LLM reasons directly over the provided content. + - File attachments use the 3-step store_attachment flow. + - History format: [{"prompt": "...", "response": "..."}] (official docs). + - SSE token field: {"type": "answer", "answer": ""}. + +Import as: + import docsgpt_utils as tdgputi +""" + +from __future__ import annotations + +import json +import logging +import os +import re +import time +from typing import Generator + +import nltk +import requests + +_LOG = logging.getLogger(__name__) + +DEFAULT_BASE_URL = "https://gptcloud.arc53.com" + + +# ############################################################################# +# Configuration +# ############################################################################# + + +def get_base_url() -> str: + """ + Return the DocsGPT base URL with no trailing slash. + + :return: Base URL string, defaulting to the cloud URL. + """ + return os.environ.get("DOCSGPT_BASE_URL", DEFAULT_BASE_URL).rstrip("/") + + +def get_api_key() -> str: + """ + Return the DocsGPT Agent API key from the DOCSGPT_API_KEY env var. 
+ + The key must be an *Agent* key obtained from: + https://app.docsgpt.cloud -> Settings -> Agents -> Create New -> Key field + + :return: Agent API key string. + :raises RuntimeError: If the DOCSGPT_API_KEY env var is not set. + """ + key = os.environ.get("DOCSGPT_API_KEY", "").strip() + if not key: + raise RuntimeError( + "DOCSGPT_API_KEY is not set.\n" + " 1. Go to https://app.docsgpt.cloud\n" + " 2. Settings -> Agents -> Create New\n" + " 3. Upload your documents to the agent in the UI\n" + " 4. Copy the Key field and run:\n" + " export DOCSGPT_API_KEY='your-agent-key'" + ) + return key + + +# ############################################################################# +# Core API wrappers +# ############################################################################# + + +def query_docsgpt( + question: str, + api_key: str, + base_url: str | None = None, + history: list | None = None, + save_conversation: bool = False, +) -> dict: + """ + Send a question to DocsGPT via POST /api/answer (non-streaming). + + The agent retrieves relevant chunks from its indexed documents (RAG) + and returns a complete answer in one JSON response. + + :param question: User question or prompt string. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL; defaults to cloud URL. + :param history: Prior conversation turns as a list of dicts. + Format: [{"prompt": "question", "response": "answer"}] + Note: the reply key is "response", NOT "answer". + :param save_conversation: Persist the conversation server-side. + :return: Dict with keys: answer, sources, conversation_id, tool_calls, thought. + """ + base_url = base_url or get_base_url() + payload: dict = { + "question": question, + "api_key": api_key, + "save_conversation": save_conversation, + } + if history: + # history must be sent as a JSON-encoded string, not a raw list. 
+ payload["history"] = json.dumps(history) + _LOG.info("query_docsgpt | %s", question[:100]) + resp = requests.post(f"{base_url}/api/answer", json=payload, timeout=90) + resp.raise_for_status() + data = resp.json() + _LOG.info("answer received (%d chars)", len(data.get("answer", ""))) + return data + + +def stream_docsgpt( + question: str, + api_key: str, + base_url: str | None = None, + history: list | None = None, + attachments: list[str] | None = None, + print_live: bool = True, +) -> str: + """ + Send a question to DocsGPT via POST /stream (SSE streaming). + + Assembles all answer tokens into a single string. Optionally prints + each token to stdout as it arrives. + + SSE event types per official docs: + "answer" - incremental token (field: "answer", NOT "token") + "source" - source chunks retrieved by RAG + "thought" - agent reasoning steps (if enabled) + "id" - final conversation_id + "error" - error message + "end" - stream finished + + :param question: User question string. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :param history: Prior turns: [{"prompt": "...", "response": "..."}] + :param attachments: List of attachment_id strings from store_attachment flow. + :param print_live: If True, print each token to stdout as it arrives. + :return: Full answer string assembled from all SSE tokens. 
+ """ + base_url = base_url or get_base_url() + payload: dict = {"question": question, "api_key": api_key} + if history: + payload["history"] = json.dumps(history) + if attachments: + payload["attachments"] = attachments + _LOG.info("stream_docsgpt | %s", question[:100]) + tokens: list[str] = [] + with requests.post( + f"{base_url}/stream", + json=payload, + stream=True, + timeout=120, + headers={"Accept": "text/event-stream"}, + ) as resp: + resp.raise_for_status() + for raw in resp.iter_lines(): + if not raw: + continue + line: str = raw.decode("utf-8") if isinstance(raw, bytes) else raw + if not line.startswith("data:"): + continue + data_str = line[5:].strip() + try: + event = json.loads(data_str) + except json.JSONDecodeError: + continue + etype = event.get("type", "") + if etype == "answer": + # Official field name is "answer". + token = event.get("answer", "") + if token: + tokens.append(token) + if print_live: + print(token, end="", flush=True) + elif etype == "end": + break + elif etype == "error": + _LOG.error("stream error: %s", event.get("error", "")) + break + if print_live: + print() + answer = "".join(tokens) + _LOG.info("stream complete (%d chars)", len(answer)) + return answer + + +def stream_docsgpt_events( + question: str, + api_key: str, + base_url: str | None = None, + history: list | None = None, +) -> Generator[dict, None, None]: + """ + Send a question to DocsGPT via POST /stream and yield every raw SSE event. + + Use this when you need access to source, thought, or id events in addition + to answer tokens. Each yielded dict has at least a 'type' key. + + :param question: User question string. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :param history: Prior turns: [{"prompt": "...", "response": "..."}] + :return: Generator of SSE event dicts. 
+ """ + base_url = base_url or get_base_url() + payload: dict = {"question": question, "api_key": api_key} + if history: + payload["history"] = json.dumps(history) + with requests.post( + f"{base_url}/stream", + json=payload, + stream=True, + timeout=120, + headers={"Accept": "text/event-stream"}, + ) as resp: + resp.raise_for_status() + for raw in resp.iter_lines(): + if not raw: + continue + line: str = raw.decode("utf-8") if isinstance(raw, bytes) else raw + if not line.startswith("data:"): + continue + data_str = line[5:].strip() + try: + event = json.loads(data_str) + yield event + if event.get("type") == "end": + break + except json.JSONDecodeError: + continue + + +def multi_turn_conversation( + questions: list[str], + api_key: str, + base_url: str | None = None, +) -> list[dict]: + """ + Run a multi-turn conversation, accumulating history between turns. + + Each turn appends to history using the official format so the agent + has full context when answering follow-up questions. + + :param questions: Ordered list of question strings. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :return: List of turn dicts, each with keys: question, answer, sources, + conversation_id. + """ + base_url = base_url or get_base_url() + # History format per official docs: key is "response", not "answer". 
+ history: list[dict] = [] + turns: list[dict] = [] + for q in questions: + result = query_docsgpt(q, api_key, base_url, history=history) + answer = result.get("answer", "") + history.append({"prompt": q, "response": answer}) + turns.append({ + "question": q, + "answer": answer, + "sources": result.get("sources", []), + "conversation_id": result.get("conversation_id", ""), + }) + _LOG.info("multi-turn: turn %d complete", len(turns)) + return turns + + +# ############################################################################# +# Attachment API (3-step: upload -> poll -> attach to /stream) +# ############################################################################# + + +def store_attachment( + file_path: str, + api_key: str, + base_url: str | None = None, +) -> str: + """ + Upload a file via POST /api/store_attachment and return the task_id. + + Step 1 of the attachment flow. Processing is asynchronous — call + poll_attachment_status() after this to wait for completion. + + :param file_path: Local path to the file to upload. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :return: task_id string to pass to poll_attachment_status(). + """ + base_url = base_url or get_base_url() + _LOG.info("store_attachment: %s", file_path) + with open(file_path, "rb") as fh: + resp = requests.post( + f"{base_url}/api/store_attachment", + files={"file": (os.path.basename(file_path), fh)}, + data={"api_key": api_key}, + timeout=60, + ) + resp.raise_for_status() + data = resp.json() + task_id = data.get("task_id", "") + _LOG.info("store_attachment task_id: %s", task_id) + return task_id + + +def poll_attachment_status( + task_id: str, + base_url: str | None = None, + timeout_sec: int = 120, + poll_interval: float = 3.0, +) -> str: + """ + Poll GET /api/task_status until attachment processing succeeds. + + Step 2 of the attachment flow. + + :param task_id: Task ID returned by store_attachment(). + :param base_url: Override base URL. 
+ :param timeout_sec: Maximum seconds to wait before raising TimeoutError. + :param poll_interval: Seconds to wait between polls. + :return: attachment_id string to use in stream_docsgpt(attachments=[...]). + :raises TimeoutError: If the task does not complete within timeout_sec. + :raises RuntimeError: If the task fails or is revoked. + """ + base_url = base_url or get_base_url() + deadline = time.time() + timeout_sec + _LOG.info("polling task_status for task_id=%s", task_id) + while time.time() < deadline: + resp = requests.get( + f"{base_url}/api/task_status", + params={"task_id": task_id}, + timeout=15, + ) + resp.raise_for_status() + data = resp.json() + status = data.get("status", "") + _LOG.debug("task status: %s", status) + if status == "SUCCESS": + attachment_id = data.get("result", {}).get("attachment_id", "") + _LOG.info("attachment ready: %s", attachment_id) + return attachment_id + if status in ("FAILURE", "REVOKED"): + raise RuntimeError(f"Attachment task failed: {data}") + time.sleep(poll_interval) + raise TimeoutError(f"Attachment task {task_id} not ready after {timeout_sec}s") + + +def upload_and_attach( + file_path: str, + api_key: str, + base_url: str | None = None, + timeout_sec: int = 120, +) -> str: + """ + Upload a file and wait for its attachment_id (combines steps 1 and 2). + + :param file_path: Local path to the file to upload. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :param timeout_sec: Maximum seconds to wait for processing. + :return: attachment_id ready for stream_docsgpt(attachments=[...]). 
+ """ + task_id = store_attachment(file_path, api_key, base_url) + return poll_attachment_status(task_id, base_url, timeout_sec) + + +# ############################################################################# +# Summarisation (inline prompt — no upload needed) +# ############################################################################# + + +def summarize_document( + text: str, + api_key: str, + base_url: str | None = None, + max_words: int = 200, + source_label: str = "document", +) -> str: + """ + Summarise document text by embedding it inline in the prompt. + + Sends the text directly inside the question to POST /api/answer. + No file upload is needed — the LLM reasons over the provided content. + Text is auto-truncated to 4000 characters before sending. + + :param text: Plain text to summarise. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :param max_words: Target word count for the summary. + :param source_label: Label used only for logging. + :return: Summary string from DocsGPT. + """ + base_url = base_url or get_base_url() + text = truncate_text(text, max_chars=4000) + prompt = ( + f"Read the following document carefully and write a concise, well-structured " + f"summary in no more than {max_words} words. Capture the key concepts, main " + f"points, and important details.\n\n" + f"DOCUMENT:\n{text}" + ) + _LOG.info("summarize_document: '%s' (%d chars)", source_label, len(text)) + result = query_docsgpt(prompt, api_key, base_url) + return result.get("answer", "") + + +# ############################################################################# +# FAQ generation (inline prompt — no upload needed) +# ############################################################################# + + +def generate_faqs( + text: str, + api_key: str, + base_url: str | None = None, + n_questions: int = 5, + source_label: str = "document", +) -> list[dict]: + """ + Generate FAQ question-answer pairs from document text. 
+ + Sends the text inline in the prompt to POST /api/answer and parses + the Q:/A: formatted response into structured dicts. + Text is auto-truncated to 4000 characters before sending. + + :param text: Plain text to generate FAQs from. + :param api_key: DocsGPT Agent API key. + :param base_url: Override base URL. + :param n_questions: Number of FAQ items to request. + :param source_label: Label used only for logging. + :return: List of dicts with keys 'question' and 'answer'. + """ + base_url = base_url or get_base_url() + text = truncate_text(text, max_chars=4000) + prompt = ( + f"Based on the document below, generate exactly {n_questions} frequently asked " + f"questions (FAQs) with detailed, helpful answers.\n\n" + f"Use this EXACT format for every FAQ (no deviation):\n" + f"Q: \n" + f"A: \n\n" + f"Separate each FAQ pair with a blank line.\n\n" + f"DOCUMENT:\n{text}" + ) + _LOG.info("generate_faqs: %d Qs for '%s'", n_questions, source_label) + result = query_docsgpt(prompt, api_key, base_url) + raw = result.get("answer", "") + return parse_faqs(raw) + + +def parse_faqs(raw_text: str) -> list[dict]: + """ + Parse DocsGPT FAQ output in Q:/A: format into a list of dicts. + + :param raw_text: Raw answer string from DocsGPT containing Q:/A: pairs. + :return: List of dicts, each with keys 'question' and 'answer'. 
+ """ + faqs: list[dict] = [] + blocks = re.split(r"\n{1,2}(?=Q:)", raw_text.strip()) + for block in blocks: + q_match = re.search(r"Q:\s*(.+?)(?:\n|$)", block, re.IGNORECASE) + a_match = re.search(r"A:\s*(.+)", block, re.IGNORECASE | re.DOTALL) + if q_match and a_match: + faqs.append({ + "question": q_match.group(1).strip(), + "answer": a_match.group(1).strip(), + }) + return faqs + + +# ############################################################################# +# Dataset loaders +# ############################################################################# + + +def fetch_awesome_ml_readme( + url: str = ( + "https://raw.githubusercontent.com/josephmisiti/" + "awesome-machine-learning/master/README.md" + ), +) -> str: + """ + Fetch the Awesome Machine Learning README from GitHub. + + :param url: Raw GitHub URL for the README. + :return: Full README text as a string. + """ + _LOG.info("fetching Awesome ML README from %s", url) + resp = requests.get(url, timeout=30) + resp.raise_for_status() + _LOG.info("fetched %d chars", len(resp.text)) + return resp.text + + +def parse_awesome_ml_sections(raw_md: str) -> dict[str, str]: + """ + Parse the Awesome ML README into a dict of section title to body text. + + :param raw_md: Raw markdown string of the README. + :return: Dict mapping section title string to section body string. + """ + sections: dict[str, str] = {} + current_title = "Preamble" + current_lines: list[str] = [] + for line in raw_md.splitlines(): + if line.startswith("## "): + sections[current_title] = "\n".join(current_lines).strip() + current_title = line.lstrip("# ").strip() + current_lines = [] + else: + current_lines.append(line) + sections[current_title] = "\n".join(current_lines).strip() + _LOG.info("parsed %d sections from Awesome ML README", len(sections)) + return sections + + +def load_stackoverflow_sample(n_rows: int = 20) -> list[dict]: + """ + Load Stack Overflow questions from HuggingFace Hub via streaming. 
+ + Uses streaming so the full dataset is never downloaded. Falls back to + a built-in synthetic sample if the dataset is unavailable. + + :param n_rows: Number of rows to load. + :return: List of dicts with keys: title, body, answer. + """ + try: + from datasets import load_dataset # type: ignore + _LOG.info("loading SO dataset (streaming, n=%d)", n_rows) + ds = load_dataset( + "pacovaldez/stackoverflow-questions", + split="train", + streaming=True, + trust_remote_code=True, + ) + rows: list[dict] = [] + for i, row in enumerate(ds): + if i >= n_rows: + break + rows.append({ + "title": row.get("Title", row.get("title", "")), + "body": row.get("Body", row.get("body", "")), + "answer": row.get("Answer", row.get("answer", "")), + }) + _LOG.info("loaded %d SO rows", len(rows)) + return rows + except Exception as exc: + _LOG.warning("SO dataset unavailable: %s — using fallback", exc) + return _synthetic_stackoverflow_sample(n_rows) + + +def _synthetic_stackoverflow_sample(n: int) -> list[dict]: + """ + Return a built-in synthetic Stack Overflow sample for offline fallback. + + :param n: Number of rows to return. + :return: List of dicts with keys: title, body, answer. + """ + template = [ + { + "title": "How do I reverse a list in Python?", + "body": "I have a list and want it reversed. What is the most Pythonic way?", + "answer": "Use list.reverse() to reverse in-place, or my_list[::-1] for a new reversed list.", + }, + { + "title": "What is a Docker volume?", + "body": "I need to persist data across container restarts. What should I use?", + "answer": "A Docker volume is managed storage outside the container filesystem that persists between restarts.", + }, + { + "title": "Difference between supervised and unsupervised learning?", + "body": "What is the key difference between supervised and unsupervised ML?", + "answer": "Supervised learning trains on labelled data. 
Unsupervised learning finds structure in unlabelled data.", + }, + { + "title": "What is gradient descent?", + "body": "Explain gradient descent and how it is used in ML model training.", + "answer": "Gradient descent minimises a loss function by iteratively updating parameters in the direction of the negative gradient.", + }, + { + "title": "SQL vs NoSQL — when to use each?", + "body": "When should I choose a SQL database over a NoSQL one?", + "answer": "Use SQL for structured data with complex queries. Use NoSQL for flexible schemas and high-volume unstructured data.", + }, + { + "title": "What is a REST API?", + "body": "Can you explain what a REST API is and its core principles?", + "answer": "REST is an architectural style over HTTP. Core principles: statelessness, uniform interface, client-server separation, cacheability.", + }, + ] + return [template[i % len(template)] for i in range(n)] + + +def load_pile_sample(n_chars: int = 8000) -> str: + """ + Load a text sample from The Pile (uncopyrighted) via HuggingFace streaming. + + Falls back to a built-in NLP passage if the dataset is unavailable. + + :param n_chars: Approximate number of characters to collect. + :return: Text string of approximately n_chars characters. 
+ """ + try: + from datasets import load_dataset # type: ignore + _LOG.info("streaming Pile dataset (target %d chars)", n_chars) + ds = load_dataset( + "monology/pile-uncopyrighted", + split="train", + streaming=True, + trust_remote_code=True, + ) + collected: list[str] = [] + total = 0 + for row in ds: + text = row.get("text", "") + collected.append(text) + total += len(text) + if total >= n_chars: + break + result = "\n\n".join(collected) + _LOG.info("collected %d chars from Pile", len(result)) + return result + except Exception as exc: + _LOG.warning("Pile dataset unavailable: %s — using fallback", exc) + return ( + "Natural language processing (NLP) is a subfield of artificial intelligence " + "that focuses on enabling computers to understand, interpret, and generate " + "human language. Modern NLP relies on large transformer-based models such as " + "BERT, GPT, and T5 that are pretrained on massive corpora and fine-tuned for " + "specific tasks. Key NLP tasks include text classification, named entity " + "recognition, sentiment analysis, machine translation, text summarisation, " + "and question answering. The attention mechanism, introduced in 'Attention is " + "All You Need' (Vaswani et al., 2017), allows models to weigh word importance " + "across a sequence. Transfer learning dramatically reduced the labelled data " + "required for downstream tasks. NLP applications power search engines, virtual " + "assistants, chatbots, content moderation, and automated document analysis." + ) + + +# ############################################################################# +# Text preprocessing helpers +# ############################################################################# + + +def clean_markdown(text: str) -> str: + """ + Strip markdown syntax to produce clean plain prose. + + Removes HTML tags, link syntax, headings, bullet points, code blocks, + inline code, and bold/italic markers. + + :param text: Raw markdown string. + :return: Clean plain-text string. 
+ """ + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) + text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) + text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE) + text = re.sub(r"```[^`]*```", " ", text, flags=re.DOTALL) + text = re.sub(r"`[^`]+`", " ", text) + text = re.sub(r"\*{1,2}([^*]+)\*{1,2}", r"\1", text) + text = re.sub(r"\n{3,}", "\n\n", text) + text = re.sub(r"[ \t]+", " ", text) + return text.strip() + + +def chunk_text( + text: str, + max_chars: int = 2000, + overlap: int = 200, +) -> list[str]: + """ + Split a long document into overlapping character-level chunks. + + :param text: Input text string. + :param max_chars: Maximum characters per chunk. + :param overlap: Overlap in characters between consecutive chunks. + :return: List of text chunk strings. + :raises ValueError: If max_chars is not greater than overlap. + """ + if max_chars <= overlap: + raise ValueError("max_chars must be greater than overlap") + chunks: list[str] = [] + start = 0 + while start < len(text): + end = min(start + max_chars, len(text)) + chunks.append(text[start:end]) + if end == len(text): + break + start += max_chars - overlap + return chunks + + +def truncate_text(text: str, max_chars: int = 4000) -> str: + """ + Truncate text to at most max_chars characters. + + :param text: Input text string. + :param max_chars: Maximum number of characters to keep. + :return: Truncated string, with '[truncated]' appended if cut. + """ + if len(text) <= max_chars: + return text + return text[:max_chars].rstrip() + " [truncated]" + + +def so_rows_to_text(rows: list[dict]) -> str: + """ + Convert Stack Overflow row dicts into a single plain-text document. + + :param rows: List of dicts with keys: title, body, answer. + :return: Combined plain-text string with --- separators between rows. 
+ """ + parts: list[str] = [] + for row in rows: + title = row.get("title", "") + body = clean_markdown(row.get("body", "")) + answer = clean_markdown(row.get("answer", "")) + parts.append(f"Question: {title}\n{body}\n\nAnswer:\n{answer}") + return "\n\n---\n\n".join(parts) + + +# ############################################################################# +# Multi-language support +# ############################################################################# + + +SUPPORTED_LANGUAGES: dict[str, str] = { + "en": "English", + "es": "Spanish", + "fr": "French", + "de": "German", + "zh": "Chinese", + "pt": "Portuguese", + "it": "Italian", + "ja": "Japanese", + "ar": "Arabic", +} + + +def list_supported_languages() -> dict[str, str]: + """ + Return the mapping of ISO-639-1 code to language name. + + :return: Dict of {code: language_name}. + """ + return dict(SUPPORTED_LANGUAGES) + + +def translate_text( + text: str, + source: str = "auto", + target: str = "en", +) -> str: + """ + Translate text using deep-translator (Google Translate backend). + + Handles texts longer than Google's ~4500-char limit by sentence-batching: + splits on sentence boundaries, translates each batch, then joins. + + :param text: Source text to translate. + :param source: Source language ISO-639-1 code or 'auto' for auto-detect. + :param target: Target language ISO-639-1 code. + :return: Translated string. 
+ """ + from deep_translator import GoogleTranslator # type: ignore + if source == target: + return text + _LOG.info("translating %d chars: %s -> %s", len(text), source, target) + max_chunk = 4500 + if len(text) <= max_chunk: + return GoogleTranslator(source=source, target=target).translate(text) or text + sentences = re.split(r"(?<=[.!?])\s+", text) + batches: list[str] = [] + current: list[str] = [] + current_len = 0 + for sent in sentences: + if current_len + len(sent) > max_chunk and current: + batches.append(" ".join(current)) + current, current_len = [sent], len(sent) + else: + current.append(sent) + current_len += len(sent) + if current: + batches.append(" ".join(current)) + parts = [ + GoogleTranslator(source=source, target=target).translate(b) or b + for b in batches + ] + return " ".join(parts) + + +def summarize_multilang( + text: str, + api_key: str, + source_lang: str = "en", + output_lang: str = "es", + base_url: str | None = None, + max_words: int = 200, + source_label: str = "doc", +) -> dict: + """ + Summarise a document and return the summary in a target language. + + Pipeline: (translate to EN if needed) -> DocsGPT summarise + -> (translate to target language). + + :param text: Input document text. + :param api_key: DocsGPT Agent API key. + :param source_lang: ISO-639-1 code of the input language. + :param output_lang: ISO-639-1 code of the desired output language. + :param base_url: Override base URL. + :param max_words: Target word count for the summary. + :param source_label: Label used only for logging. + :return: Dict with keys 'english_summary' and 'translated_summary'. 
+ """ + base_url = base_url or get_base_url() + english_text = ( + text if source_lang == "en" + else translate_text(text, source_lang, "en") + ) + english_summary = summarize_document( + english_text, api_key, base_url, max_words, source_label + ) + text_to_translate = clean_markdown(english_summary) + translated = ( + english_summary if output_lang == "en" + else translate_text(text_to_translate, "en", output_lang) + ) + return {"english_summary": english_summary, "translated_summary": translated} + + +def generate_faqs_multilang( + text: str, + api_key: str, + source_lang: str = "en", + output_lang: str = "es", + n_questions: int = 5, + base_url: str | None = None, + source_label: str = "doc", +) -> dict: + """ + Generate FAQs from a document and return them in a target language. + + :param text: Input document text. + :param api_key: DocsGPT Agent API key. + :param source_lang: ISO-639-1 code of the input language. + :param output_lang: ISO-639-1 code of the desired output language. + :param n_questions: Number of FAQ items to generate. + :param base_url: Override base URL. + :param source_label: Label used only for logging. + :return: Dict with keys 'english_faqs' and 'translated_faqs', + each a list of dicts with keys 'question' and 'answer'. 
+ """ + base_url = base_url or get_base_url() + english_text = ( + text if source_lang == "en" + else translate_text(text, source_lang, "en") + ) + english_faqs = generate_faqs( + english_text, api_key, base_url, n_questions, source_label + ) + if output_lang == "en": + return {"english_faqs": english_faqs, "translated_faqs": english_faqs} + translated_faqs = [ + { + "question": translate_text(f["question"], "en", output_lang), + "answer": translate_text(f["answer"], "en", output_lang), + } + for f in english_faqs + ] + return {"english_faqs": english_faqs, "translated_faqs": translated_faqs} + + +# ############################################################################# +# Evaluation metrics (ROUGE + BLEU) +# ############################################################################# + + +def rouge_scores(hypothesis: str, reference: str) -> dict[str, float]: + """ + Compute ROUGE-1, ROUGE-2, and ROUGE-L F1 scores. + + :param hypothesis: Generated text (e.g. DocsGPT summary or FAQ answer). + :param reference: Ground-truth reference text. + :return: Dict with keys 'rouge1', 'rouge2', 'rougeL', all floats in [0, 1]. + """ + from rouge_score import rouge_scorer as rs # type: ignore + scorer = rs.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True) + scores = scorer.score(reference, hypothesis) + return { + "rouge1": round(scores["rouge1"].fmeasure, 4), + "rouge2": round(scores["rouge2"].fmeasure, 4), + "rougeL": round(scores["rougeL"].fmeasure, 4), + } + + +def bleu_score(hypothesis: str, reference: str) -> float: + """ + Compute sentence-level BLEU score with smoothing (NLTK). + + :param hypothesis: Generated text. + :param reference: Ground-truth reference text. + :return: BLEU score as float in [0, 1]. 
+ """ + from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction # type: ignore + nltk.download("punkt", quiet=True) + nltk.download("punkt_tab", quiet=True) + ref_tokens = reference.lower().split() + hyp_tokens = hypothesis.lower().split() + sf = SmoothingFunction().method1 + return round(sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=sf), 4) + + +def evaluate_output(hypothesis: str, reference: str) -> dict[str, float]: + """ + Evaluate generated text against a reference using ROUGE and BLEU. + + :param hypothesis: Generated text. + :param reference: Ground-truth reference text. + :return: Dict with keys 'rouge1', 'rouge2', 'rougeL', 'bleu'. + """ + r = rouge_scores(hypothesis, reference) + b = bleu_score(hypothesis, reference) + result = {**r, "bleu": b} + _LOG.info("evaluate_output: %s", result) + return result + + +def evaluate_all( + summaries: dict[str, str], + faqs: dict[str, list[dict]], + source_texts: dict[str, str], +) -> dict[str, dict[str, float]]: + """ + Evaluate summaries and FAQ answers for multiple labelled documents. + + Uses the first 500 chars of each source text as the ground-truth reference. + + :param summaries: Dict of {label: summary_string}. + :param faqs: Dict of {label: [{"question": ..., "answer": ...}]}. + :param source_texts: Dict of {label: full_source_text}. + :return: Dict of {label: {rouge1, rouge2, rougeL, bleu, faq_rouge1, ...}}. 
+ """ + results: dict = {} + for label in summaries: + reference = truncate_text(source_texts.get(label, ""), max_chars=500) + summary_scores = evaluate_output(summaries[label], reference) + faq_list = faqs.get(label, []) + faq_scores: dict = {} + if faq_list: + fs = evaluate_output(faq_list[0]["answer"], reference) + faq_scores = {f"faq_{k}": v for k, v in fs.items()} + results[label] = {**summary_scores, **faq_scores} + return results + + +# ############################################################################# +# Display helpers +# ############################################################################# + + +def print_answer(result: dict, label: str = "") -> None: + """ + Pretty-print a query_docsgpt() result dict. + + :param result: Dict returned by query_docsgpt(). + :param label: Optional label prefix for output lines. + """ + prefix = f"[{label}] " if label else "" + print(f"\n{prefix}ANSWER:\n{result.get('answer', '(no answer)')}") + sources = result.get("sources", []) + if sources: + print(f"{prefix}SOURCES: {sources}") + + +def print_faqs(faqs: list[dict], label: str = "") -> None: + """ + Pretty-print a list of FAQ dicts. + + :param faqs: List of dicts with keys 'question' and 'answer'. + :param label: Optional label prefix for output lines. + """ + prefix = f"[{label}] " if label else "" + if not faqs: + print(f"{prefix}(no FAQs generated)") + return + for i, faq in enumerate(faqs, 1): + print(f"\n{prefix}Q{i}: {faq['question']}") + print(f"{prefix}A{i}: {faq['answer']}") + + +def print_scores(scores: dict, label: str = "") -> None: + """ + Pretty-print an evaluation score dict. + + :param scores: Dict of metric_name -> float score. + :param label: Optional label prefix for output lines. 
+ """ + prefix = f"[{label}] " if label else "" + for k, v in scores.items(): + print(f" {prefix}{k.upper()}: {v:.4f}") diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/project_template_README.md b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/project_template_README.md new file mode 100644 index 000000000..58d90e2d1 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/project_template_README.md @@ -0,0 +1,802 @@ +# Summary +This directory contains a Docker-based development environment template with: + +- Utility scripts for Docker operations (build, run, clean, push) +- Configuration files for Dockerfile and environment setup +- Jupyter notebook templates for standardized project development +- Shell utilities and Python helpers for container-based workflows + +A guide to set up Docker-based projects using the template, customize it for +your needs, and maintain it over time. 
+ +## Description of Files +- `bashrc` + - Bash configuration file enabling `vi` mode for command-line editing + +- `copy_docker_files.py` + - Python script for copying Docker configuration files to destination + directories + +- `docker_build.version.log` + - Log file containing Python, `pip`, Jupyter, and package version information + from Docker build + +- `docker_cmd.sh` + - Shell script for executing arbitrary commands inside Docker containers with + volume mounting + +- `docker_jupyter.sh` + - Shell script for launching Jupyter Lab server inside Docker containers + +- `docker_name.sh` + - Configuration file defining Docker repository and image naming variables + +- `Dockerfile` + - Docker image build configuration with Ubuntu, Python, Jupyter, and project + dependencies + +- `etc_sudoers` + - Sudoers configuration file granting passwordless sudo access for postgres + user + +- `README.md` + - Documentation file describing directory contents, files, and executable + scripts + +- `template_utils.py` + - Python utility functions supporting tutorial notebooks with data processing + and modeling helpers + +- `template.API.ipynb` + - Jupyter notebook template for API exploration and library usage examples + +- `template.example.ipynb` + - Jupyter notebook template for project examples and demonstrations + +- `utils.sh` + - Bash utility library with reusable functions for Docker operations + - Provides centralized argument parsing (`parse_default_args`) for `-h` and + `-v` flags used by all `docker_*.sh` scripts + - Provides Jupyter configuration logic: vim keybindings, notification + settings, and Docker run option builders + - All `docker_*.sh`, `docker_jupyter.sh`, and `run_jupyter.sh` scripts across + the repo source this file from `class_project/project_template/utils.sh` + +## Workflows +- All commands should be run from inside the project directory + ```bash + > cd tutorials/FilterPy + ``` + +- To build the container for a project + ```bash + > cd $PROJECT + 
# Build the container. + > docker_build.sh + # Build without cache (pass extra args after -v). + > docker_build.sh --no-cache + # Test the container. + > docker_bash.sh ls + ``` + +- Enable verbose (trace) output with `-v` + ```bash + > docker_build.sh -v + > docker_bash.sh -v + ``` + +- Get help for any docker script + ```bash + > docker_build.sh -h + > docker_jupyter.sh -h + ``` + +- Start Jupyter + ```bash + > docker_jupyter.sh + # Go to localhost:8888 + ``` + +- Start Jupyter on a specific port with vim support + ```bash + > docker_jupyter.sh -p 8890 -u + # Go to localhost:8890 + ``` + +## How to Customize a Project Template +- Copy the template + ```bash + > cp -r class_project/project_template $TARGET + ``` + +## Description of Executables + +### `copy_docker_files.py` +- **What It Does** + - Copies Docker configuration and utility files from project_template to a + destination directory + - Preserves all file permissions and attributes during copying + - Creates destination directory if it doesn't exist + +- Copy all Docker files to a target directory: + ```bash + > ./copy_docker_files.py --dst_dir /path/to/destination + ``` + +- Copy with verbose logging: + ```bash + > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG + ``` + +### `docker_bash.sh` +- **What It Does** + - Launches an interactive bash shell inside a Docker container + - Mounts the current working directory as `/data` inside the container + - Exposes port 8888 for potential services running in the container + - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` + +- Launch bash shell in the container: + ```bash + > ./docker_bash.sh + ``` + +- Launch with verbose output (prints each command): + ```bash + > ./docker_bash.sh -v + ``` + +### `docker_build.sh` +- **What It Does** + - Builds Docker container images using Docker BuildKit + - Supports single-architecture builds (default) or multi-architecture builds + (`linux/arm64`, `linux/amd64`) + - Copies 
project files to temporary build directory and generates build logs + - Accepts `-h` (help) and `-v` (verbose/trace) flags; any extra arguments + after flags are forwarded to `docker build` + +- Build container image for current architecture: + ```bash + > ./docker_build.sh + ``` + +- Build without Docker layer cache: + ```bash + > ./docker_build.sh --no-cache + ``` + +- Build multi-architecture image (requires setting `DOCKER_BUILD_MULTI_ARCH=1` + in the script): + ```bash + > # Edit docker_build.sh to set DOCKER_BUILD_MULTI_ARCH=1 + > ./docker_build.sh + ``` + +### `docker_clean.sh` +- **What It Does** + + - Removes all Docker images matching the project's full image name + - Lists images before and after removal for verification + - Uses force removal to ensure cleanup completes + +- Remove project's Docker images: + ```bash + > ./docker_clean.sh + ``` + +### `docker_cmd.sh` +- **What It Does** + - Executes arbitrary commands inside a Docker container + - Mounts current directory as `/data` for accessing project files + - Automatically removes container after command execution completes + - Accepts `-h` (help) and `-v` (verbose/trace) flags; remaining arguments + form the command to execute + +- Run Python script inside container: + ```bash + > ./docker_cmd.sh python script.py --arg value + ``` + +- List files in the container: + ```bash + > ./docker_cmd.sh ls -la /data + ``` + +- Run tests inside container: + ```bash + > ./docker_cmd.sh pytest tests/ + ``` + +### `docker_exec.sh` +- **What It Does** + - Attaches to an already running Docker container with an interactive bash + shell + - Finds the container ID automatically based on the image name + - Useful for debugging or inspecting running containers + - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` + +- Attach to running container: + ```bash + > ./docker_exec.sh + ``` + +### `docker_jupyter.sh` +- **What It Does** + - Launches Jupyter Lab server inside a Docker container + -
Supports custom port configuration (default 8888), vim keybindings, and + custom directory mounting + - Runs `run_jupyter.sh` script inside the container with specified options + +- Start Jupyter on default port 8888: + ```bash + > ./docker_jupyter.sh + ``` + +- Start Jupyter on custom port with vim bindings: + ```bash + > ./docker_jupyter.sh -p 8889 -u + ``` + +- Start Jupyter with external directory mounted: + ```bash + > ./docker_jupyter.sh -d /path/to/notebooks -p 8889 + ``` + +- Start Jupyter in verbose mode: + ```bash + > ./docker_jupyter.sh -v -p 8890 + ``` + +### `docker_push.sh` +- **What It Does** + - Authenticates to Docker registry using credentials from + `~/.docker/passwd.$REPO_NAME.txt` + - Pushes the project's Docker image to the remote repository + - Lists images before pushing for verification + +- Push container image to registry: + ```bash + > ./docker_push.sh + ``` + +### `run_jupyter.sh` +- **What It Does** + - Launches Jupyter Lab server with no authentication (token and password + disabled) + - Binds to all network interfaces (0.0.0.0) on port 8888 + - Allows root access for container environments + - When `JUPYTER_USE_VIM=1`, verifies that `jupyterlab_vim` is installed + before enabling vim keybindings; exits with an error if not found + +- Start Jupyter Lab server (typically called from docker_jupyter.sh): + ```bash + > ./run_jupyter.sh + ``` + +- Start with vim keybindings (requires `jupyterlab_vim` installed in the + container): + ```bash + > JUPYTER_USE_VIM=1 ./run_jupyter.sh + ``` + +### `utils.sh` +- **What It Does** + - Central Bash library sourced by all `docker_*.sh` and `run_jupyter.sh` + scripts across the repository + - Provides `parse_default_args` which adds `-h` (help) and `-v` + (verbose/`set -x`) flags to every docker script + - Provides `build_container_image`, `push_container_image`, + `remove_container_image`, `kill_container`, `exec_container` utilities + - Provides Jupyter configuration helpers: vim keybindings, 
notification + suppression, and Docker run option builders + +### `version.sh` +- **What It Does** + - Reports version information for Python3, pip3, and Jupyter + - Lists all installed Python packages with versions + - Used during Docker image builds to log environment configuration + +- Display version information: + ```bash + > ./version.sh + ``` + +- Save version information to a log file: + ```bash + > ./version.sh 2>&1 | tee version.log + ``` + +# Template Customization and Maintenance + +## Quick Start for New Projects + +### Step 1: Copy the Template +```bash +> cd class_project/project_template +> cp -r . /path/to/your/new/project +> cd /path/to/your/new/project +``` + +### Step 2: Choose a Base Image +The template includes three Dockerfile options. Choose the one that best fits +your project: + +| Option | File | Best For | +| -------------------------- | ------------------------ | ---------------------------------------------------------------- | +| **Standard** | `Dockerfile.ubuntu` | Full Ubuntu environment with system tools | +| **Lightweight** | `Dockerfile.python_slim` | Minimal Python environment; reduced image size | +| **Modern Package Manager** | `Dockerfile.uv` | Fast dependency resolution with [uv](https://docs.astral.sh/uv/) | + +**How to choose:** + +- **Use Standard** if you need system-level tools (git, curl, graphviz, etc.) 
+- **Use Python Slim** to minimize image size and build time +- **Use uv** if you want faster, more reliable dependency management + +### Step 3: Set Up Your Dockerfile +- Delete the reference files you did not choose (this example keeps `Dockerfile.ubuntu`) + ```bash + > rm Dockerfile.python_slim Dockerfile.uv + ``` + +- Create your working Dockerfile from the reference file you kept + ```bash + > cp Dockerfile.ubuntu Dockerfile + ``` + +- Add your dependencies + ```bash + > printf 'numpy\npandas\nscikit-learn\n' > requirements.in + > pip-compile requirements.in > requirements.txt + ``` + +### Step 4: Keep Customization Minimal +- Only modify what's necessary for your project +- Use `requirements.txt` for all Python packages (don't edit Dockerfile for + this) +- Keep `bashrc` and `etc_sudoers` as-is unless you need custom shell setup +- Keep base image and Python version unless you have specific requirements + +## Understanding the Dockerfile Flow +Each Dockerfile follows the same structure. Here are the key stages: + +### Stage 1: Base Image and System Setup +```dockerfile +FROM ubuntu:24.04 +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get -y update && apt-get -y upgrade +``` + +- **Purpose**: Start with a clean base image (`ubuntu:24.04`, or `python:3.12-slim`, depending on your requirement) and disable interactive + installation prompts + +- **When to customize**: Only change the base image or version if your project + has specific requirements (different Ubuntu version, specific Python version, + etc.)
+ +### Stage 2: System Utilities (Ubuntu-based Dockerfiles Only) +```dockerfile +RUN apt install -y --no-install-recommends \ + sudo \ + curl \ + systemctl \ + gnupg \ + git \ + vim +``` + +- **Purpose**: Install essential system tools for development and container + management + +- **When to customize**: Add only if needed for your project + - `postgresql-client`: for database connections + - `graphviz`: for graph visualizations + - `ffmpeg`: for media processing + +- **Best practice**: Use `--no-install-recommends` to keep the image small + +### Stage 3: Python and Build Tools (Ubuntu-based Dockerfiles Only) +```dockerfile +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* +``` + +- **Purpose**: Install Python 3, pip, and build tools needed for compiled + packages + +- **Why venv**: Creates an isolated Python environment separate from system + Python + +- **When to customize**: Rarely. Only change if you need a specific Python + version (e.g., `python3.11` instead of `python3`) + +### Stage 4: Virtual Environment Setup +```dockerfile +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" +RUN python -m pip install --upgrade pip +``` + +- **Purpose**: Create and activate an isolated virtual environment for your + project + +- **Why this matters**: Ensures reproducibility and prevents dependency + conflicts across projects + +- **When to customize**: Never. 
This is a standard best practice + +### Stage 5: Jupyter Installation +```dockerfile +RUN pip install jupyterlab jupyterlab_vim +``` + +- **Purpose**: Install JupyterLab and the Vim keybinding extension for + interactive development + - `jupyterlab`: the main IDE for running notebooks in the browser + - `jupyterlab_vim`: adds Vim-style navigation to notebook cells + +- **Why in Dockerfile, not requirements.txt**: These are infrastructure + packages (the IDE itself), not project-specific dependencies + - Do NOT add `jupyterlab`, `jupyterlab-vim`, or `ipywidgets` to + `requirements.txt`; they are already installed here + +- **When to customize**: + - **Remove** this line if your project doesn't use Jupyter + - **Add more extensions** if needed (e.g., `jupyterlab-git`, + `jupyterlab-variableinspector`) + +### Stage 6: Project Dependencies +```dockerfile +COPY requirements.txt /install/requirements.txt +RUN pip install --no-cache-dir -r /install/requirements.txt +``` + +- **Purpose**: Install your project-specific Python packages + +- **When to customize**: This is the primary place to customize. 
Define all your + dependencies in `requirements.txt` + +- **Best practice**: + - **Pin all versions**: `numpy==1.24.0` (not `numpy>=1.20.0`) + - **Use `--no-cache-dir`**: Reduces image size by skipping pip cache + - **For complex dependencies**: Use `requirements.in` with `pip-tools` or + `pip-compile` + +- **Example requirements.txt**: + ```text + numpy==1.24.0 + pandas==2.0.0 + scikit-learn==1.2.2 + tensorflow==2.13.0 + ``` + +### Stage 7: Configuration +```dockerfile +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc +``` + +- **Purpose**: Apply custom bash configuration and sudo permissions + +- **When to customize**: + - **Edit `bashrc`**: to add aliases, environment variables, or custom prompt + - **Edit `etc_sudoers`**: if additional users need passwordless sudo access + +### Stage 8: Version Logging +```dockerfile +ADD version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log +``` + +- **Purpose**: Document the exact versions of Python, pip, Jupyter, and all + installed packages + +- **What it logs**: + - Python 3 version + - Pip version + - Jupyter version + - Complete list of all installed Python packages + +- **Why it matters**: Creates a detailed record of your container's environment + for troubleshooting and reproducibility + +- **How to use**: After building, review `version.log` to verify all + dependencies installed correctly + ```bash + > docker build -t my-project . 
+ > cat version.log + ``` + +- **Extending it**: If you need to log additional tools (MongoDB, Node.js, + etc.), add them to `version.sh`: + ```bash + > echo "# mongo" + > mongod --version + ``` + +### Stage 9: Port Declaration +```dockerfile +EXPOSE 8888 +``` + +- **Purpose**: Declare that the container uses port 8888 (informational for + Docker) + +- **When to customize**: Add additional ports if your application needs them + (e.g., `EXPOSE 8888 5432 3000`) + +## Best Practices: Keep It Simple + +### The Core Principle +Only change what's necessary for your project. Everything else should inherit +from the template. + +This approach: + +- Makes Dockerfiles easier to understand and maintain +- Keeps images smaller and faster to build +- Simplifies future updates from the template +- Ensures consistency across similar projects + +### How to Do It Right +| What | Where | Example | +| :--------------------------- | :--------------------------- | :------------------------------ | +| Project Python packages | `requirements.txt` | `numpy==1.24.0` | +| Jupyter + Vim (always there) | Dockerfile Stage 5 | `jupyterlab jupyterlab_vim` | +| System tools | Dockerfile `apt-get` section | `postgresql-client` | +| Shell aliases | `bashrc` | `alias jlab="jupyter lab"` | +| Custom scripts | `scripts/` directory | Setup or initialization scripts | +| User permissions | `etc_sudoers` | Grant passwordless sudo | + +- **Do NOT add to `requirements.txt`**: `jupyterlab`, `jupyterlab-vim`, + `jupyterlab_vim`, or `ipywidgets` — these are Jupyter infrastructure packages + and are already installed in Stage 5 of the Dockerfile + +### Wrong Vs. 
Right Approach +- **Wrong**: Embed everything in the Dockerfile + ```dockerfile + RUN pip install my-package && python my_setup.py && npm install + ``` + +- **Right**: Use separate files and keep Dockerfile clean + ```dockerfile + COPY requirements.txt /install/ + RUN pip install --no-cache-dir -r /install/requirements.txt + COPY scripts/setup.sh /install/ + RUN /install/setup.sh + ``` + +## `.dockerignore` Policy + +### Why It Matters +The `.dockerignore` file prevents unnecessary files from being added to the +Docker build context: + +- **Reduces build time**: Fewer files to transfer to Docker daemon +- **Reduces image size**: Only necessary files are included +- **Improves security**: Prevents leaking sensitive data + +### What to Exclude: Category Breakdown +- Python Artifacts (Always Exclude) + ```verbatim + __pycache__/ + *.pyc + *.pyo + *.pyd + ``` + - Why: Compiled bytecode generated at runtime. Regenerated in container, adds + bloat + +- Virtual Environments (Always Exclude) + ```verbatim + venv/ + .venv/ + env/ + .env/ + ``` + - Why: Local venvs aren't portable to containers. The Dockerfile creates its + own + +- Jupyter Checkpoints (Always Exclude) + ```verbatim + .ipynb_checkpoints/ + ``` + - Why: Auto-generated by Jupyter, not needed in the image + +- Git and Version Control (Always Exclude) + ```verbatim + .git/ + .gitignore + .gitattributes + ``` + - Why: Repository history not needed at runtime + +- Docker Build Scripts (Always Exclude) + ```verbatim + docker_build.sh + docker_push.sh + docker_clean.sh + docker_exec.sh + docker_cmd.sh + docker_bash.sh + docker_jupyter.sh + docker_name.sh + Dockerfile.* + ``` + - Why: Local development scripts don't run inside the container + +- Large Data Files (Recommended) + ```verbatim + data/ + *.csv + *.pkl + *.h5 + *.parquet + ``` + - Why: Don't ship large training and test data in the image.
Mount via volume + instead + - Best practice: `docker run -v /path/to/data:/data my-image` + +- Test Files (Project-Dependent) + ```verbatim + tests/ + tutorials/ + ``` + - Why: Exclude if tests don't run in the container + - When to include: If your CI/CD pipeline runs tests inside the container + +- Documentation (Recommended) + ```verbatim + README.md + docs/ + *.md + ``` + - Why: Not needed at runtime + - Exception: Only keep if your app reads these files at runtime + +- Generated Files (Always Exclude) + ```verbatim + *.log + *.tmp + *.cache + build/ + dist/ + ``` + - Why: Generated at runtime, not needed in the image + +## Workflow: From Template to Your Project + +### Complete Setup Checklist +- Copy the template + ```bash + > cp -r project_template my-new-project + > cd my-new-project + ``` + +- Keep all reference Dockerfiles + ```verbatim + Dockerfile.ubuntu_24_04 + Dockerfile.python_slim + Dockerfile.uv + ``` + +- Create your working Dockerfile + ```bash + > cp Dockerfile.ubuntu_24_04 Dockerfile + ``` + +- Add your dependencies + ```bash + > pip freeze > requirements.txt + ``` + +- Configure `.dockerignore`: Review the template `.dockerignore` and add your + project-specific exclusions (e.g., data directories) + +- Test the build + ```bash + > docker build -t my-project:latest . + > docker run -it my-project:latest bash + ``` + +- Test Jupyter (if using) + ```bash + > ./docker_jupyter.sh -p 8888 + ``` + +- Document customizations in your project README: + - Base image chosen and why + - Key dependencies + - Any Dockerfile modifications + - How to build and run + +## Maintaining Your Setup + +### Document Any Changes +- If you modify the Dockerfile, add explanatory comments: + ```dockerfile + # Custom: PostgreSQL client for database access + postgresql-client \ + + # Custom: Node.js for frontend builds + nodejs \ + ``` + +### Monitor Package Versions +- After each build, review `version.log`: + ```bash + > docker build -t my-project .
+ > cat version.log + ``` + +### Keep `.dockerignore` Updated +- If you add new directories or files, update `.dockerignore`. Add to + `.dockerignore` if the directory shouldn't be in the image: + ```verbatim + data/ + cache/ + .temp/ + ``` + +### Contribute Improvements Back +When you improve your project's Docker setup: + +- Test thoroughly in your project +- Document the improvement clearly +- Submit back to `project_template` +- Other projects can adopt it when they update + +Example improvements: + +- Better way to install TensorFlow with GPU support +- Optimized `.dockerignore` for data science projects +- Security hardening (non-root user setup) + +## Troubleshooting + +### Build Is Slow +- Check `.dockerignore`: Ensure large directories (data/, .git/) are excluded +- Check Docker daemon: Verify Docker is running properly +- Check layer caching: Docker reuses cached layers; avoid changing early layers + +### Image Is Too Large +- Check layer sizes: + ```bash + > docker history my-project:latest + ``` + +- Remove unnecessary packages or use `python_slim` base image + +### Package Not Found Error +- Verify the package name on PyPI (check the spelling; names are case-insensitive) +- Check Python version compatibility +- Pin specific version if needed + +### Permission Issues in Container +- Check `etc_sudoers`: Ensure user has appropriate permissions +- Check file ownership: Ensure COPY doesn't create root-only files + +### Jupyter Won't Connect +- Run Jupyter + ```bash + > ./docker_jupyter.sh -p 8888 + ``` + +- Verify http://localhost:8888 (not https).
Check firewall if remote access + needed + +### Vim Keybindings Not Working +- If `run_jupyter.sh` exits with `ERROR: jupyterlab_vim is not installed`, it + means `jupyterlab_vim` is missing from the container image +- Make sure `jupyterlab_vim` is installed in the Dockerfile: + ```dockerfile + RUN pip install jupyterlab jupyterlab_vim + ``` +- Rebuild the image after adding the package: + ```bash + > ./docker_build.sh + ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt index 49aca3901..ea1d177dd 100644 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/requirements.txt @@ -1,4 +1,46 @@ -matplotlib -numpy -pandas -seaborn +# =========================================================================== +# DocsGPT Tutorial — Python dependencies +# =========================================================================== +# Install: pip install -r requirements.txt +# Versions are pinned (==), except python-dotenv and huggingface-hub (>= minimum).
+ +# --------------------------------------------------------------------------- +# Core — HTTP client and API interaction +# --------------------------------------------------------------------------- +requests==2.32.3 +python-dotenv>=1.0.1 + +# --------------------------------------------------------------------------- +# NLP evaluation metrics +# --------------------------------------------------------------------------- +rouge-score==0.1.2 +nltk==3.9.1 + +# --------------------------------------------------------------------------- +# Dataset loaders (HuggingFace Hub streaming) +# --------------------------------------------------------------------------- +datasets==2.20.0 +huggingface-hub>=0.24.6 + +# --------------------------------------------------------------------------- +# Multi-language translation +# --------------------------------------------------------------------------- +deep-translator==1.11.4 + +# --------------------------------------------------------------------------- +# User interface +# --------------------------------------------------------------------------- +gradio==6.14.0 + +# --------------------------------------------------------------------------- +# Notebook utilities +# --------------------------------------------------------------------------- +pandas==2.2.2 +jupyterlab==4.2.5 +jupytext==1.16.4 + +# --------------------------------------------------------------------------- +# Dev / linting +# --------------------------------------------------------------------------- +black==24.8.0 +pylint==3.3.1 \ No newline at end of file diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh index d725c3fe7..470d08719 100755 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh +++ 
b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/run_jupyter.sh @@ -25,6 +25,10 @@ get_docker_vars_script ${BASH_SOURCE[0]} source $DOCKER_NAME print_docker_vars +# Configure vim keybindings and notifications. +configure_jupyter_vim_keybindings +configure_jupyter_notifications + # Setup Jupyter Lab environment. setup_jupyter_environment diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb deleted file mode 100644 index 3afca937c..000000000 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.ipynb +++ /dev/null @@ -1,215 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "183c2248-ea3d-43ba-b87e-d821bba1bbc6", - "metadata": {}, - "source": [ - "# Template API Notebook\n", - "\n", - "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`.\n", - "\n", - "- Add description of what the notebook does.\n", - "- Point to references, e.g. (neo4j.API.md)\n", - "- Add citations.\n", - "- Keep the notebook flow clear.\n", - "- Comments should be imperative and have a period at the end.\n", - "- Your code should be well commented.\n", - "\n", - "The name of this notebook should in the following format:\n", - "- if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb`\n", - "\n", - "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "265e0d58-a7cd-4edf-a0b4-96b60220e801", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "id": "d3b2f997-5c9b-4238-b6d5-e5f2cea43809", - "metadata": {}, - "source": [ - "## Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d1480ee9-d6a6-437d-b927-da6cbb05bdf5", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "# Import libraries in this section.\n", - "# Avoid imports like import *, from ... import ..., from ... import *, etc.\n", - "\n", - "import helpers.hdbg as hdbg\n", - "import helpers.hnotebook as hnotebo" - ] - }, - { - "cell_type": "markdown", - "id": "f9208cc9-837d-4fec-a312-9c4aa5b7648d", - "metadata": {}, - "source": [ - "## Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "9a2d7a9c-c6c5-48c9-8445-11c97045d00b", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0mWARNING: Running in Jupyter\n", - "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-085a2ce7-6161-4c8a-92d5-492051832f3c.json'\n" - ] - } - ], - "source": [ - "hdbg.init_logger(verbosity=logging.INFO)\n", - "\n", - "_LOG = logging.getLogger(__name__)\n", - "\n", - "hnotebo.config_notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "79c37ba3-bd5d-4a44-87df-645eee54977a", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "## Make the notebook flow clear\n", - "Each notebook needs to follow a clear and logical flow, e.g:\n", - "- Load data\n", - "- Compute stats\n", - "- Clean data\n", - "- Compute stats\n", - "- Do analysis\n", - "- Show results\n", - "\n", - "\n", - "\n", - "\n", - "#############################################################################\n", - "Template\n", - 
"#############################################################################" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a8a109cd-fc8e-4b9e-9dc0-4fc8d4126ad8", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "class Template:\n", - " \"\"\"\n", - " Brief imperative description of what the class does in one line, if needed.\n", - " \"\"\"\n", - "\n", - " def __init__(self):\n", - " pass\n", - "\n", - " def method1(self, arg1: int) -> None:\n", - " \"\"\"\n", - " Brief imperative description of what the method does in one line.\n", - "\n", - " You can elaborate more in the method docstring in this section, for e.g. explaining\n", - " the formula/algorithm. Every method/function should have a docstring, typehints and include the\n", - " parameters and return as follows:\n", - "\n", - " :param arg1: description of arg1\n", - " :return: description of return\n", - " \"\"\"\n", - " # Code bloks go here.\n", - " # Make sure to include comments to explain what the code is doing.\n", - " # No empty lines between code blocks.\n", - " pass\n", - "\n", - "\n", - "def template_function(arg1: int) -> None:\n", - " \"\"\"\n", - " Brief imperative description of what the function does in one line.\n", - "\n", - " You can elaborate more in the function docstring in this section, for e.g. explaining\n", - " the formula/algorithm. 
Every function should have a docstring, typehints and include the\n", - " parameters and return as follows:\n", - "\n", - " :param arg1: description of arg1\n", - " :return: description of return\n", - " \"\"\"\n", - " # Code bloks go here.\n", - " # Make sure to include comments to explain what the code is doing.\n", - " # No empty lines between code blocks.\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "id": "00926523-ae59-497d-bba8-b22e58333849", - "metadata": {}, - "source": [ - "## The flow should be highlighted using headings in markdown\n", - "```\n", - "# Level 1\n", - "## Level 2\n", - "### Level 3\n", - "```" - ] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,py:percent" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py deleted file mode 100644 index 4192ef8fe..000000000 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.API.py +++ /dev/null @@ -1,129 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.0 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Template API Notebook -# -# This is a template notebook. The first heading should be the title of what notebook is about. 
For example, if it is a neo4j tutorial the heading should be `Neo4j API`. -# -# - Add description of what the notebook does. -# - Point to references, e.g. (neo4j.API.md) -# - Add citations. -# - Keep the notebook flow clear. -# - Comments should be imperative and have a period at the end. -# - Your code should be well commented. -# -# The name of this notebook should in the following format: -# - if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb` -# -# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md - -# %% -# %load_ext autoreload -# %autoreload 2 -# %matplotlib inline - -# %% [markdown] -# ## Imports - -# %% -import logging -# Import libraries in this section. -# Avoid imports like import *, from ... import ..., from ... import *, etc. - -import helpers.hdbg as hdbg -import helpers.hnotebook as hnotebo - -# %% [markdown] -# ## Configuration - -# %% -hdbg.init_logger(verbosity=logging.INFO) - -_LOG = logging.getLogger(__name__) - -hnotebo.config_notebook() - - -# %% [markdown] -# ## Make the notebook flow clear -# Each notebook needs to follow a clear and logical flow, e.g: -# - Load data -# - Compute stats -# - Clean data -# - Compute stats -# - Do analysis -# - Show results -# -# -# -# - - -# ############################################################################# -# Template -# ############################################################################# - - -# %% -class Template: - """ - Brief imperative description of what the class does in one line, if needed. - """ - - def __init__(self): - pass - - def method1(self, arg1: int) -> None: - """ - Brief imperative description of what the method does in one line. - - You can elaborate more in the method docstring in this section, for e.g. explaining - the formula/algorithm. 
Every method/function should have a docstring, typehints and include the - parameters and return as follows: - - :param arg1: description of arg1 - :return: description of return - """ - # Code bloks go here. - # Make sure to include comments to explain what the code is doing. - # No empty lines between code blocks. - pass - - -def template_function(arg1: int) -> None: - """ - Brief imperative description of what the function does in one line. - - You can elaborate more in the function docstring in this section, for e.g. explaining - the formula/algorithm. Every function should have a docstring, typehints and include the - parameters and return as follows: - - :param arg1: description of arg1 - :return: description of return - """ - # Code bloks go here. - # Make sure to include comments to explain what the code is doing. - # No empty lines between code blocks. - pass - - -# %% [markdown] -# ## The flow should be highlighted using headings in markdown -# ``` -# # Level 1 -# ## Level 2 -# ### Level 3 -# ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb deleted file mode 100644 index a2e9aedd7..000000000 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.ipynb +++ /dev/null @@ -1,198 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "50f78f7e-2dee-45d6-9d37-7a55eeaae283", - "metadata": {}, - "source": [ - "# Template Example Notebook\n", - "\n", - "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`.\n", - "\n", - "- Add description of what the notebook does.\n", - "- Point to references, e.g. 
(neo4j.example.md)\n", - "- Add citations.\n", - "- Keep the notebook flow clear.\n", - "- Comments should be imperative and have a period at the end.\n", - "- Your code should be well commented.\n", - "\n", - "The name of this notebook should in the following format:\n", - "- if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb`\n", - "\n", - "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "6226667e-cab5-479c-be6a-6b7d6f580a97", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8020901a-4bc7-4b73-95e8-aaa462b4fc19", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "# Import libraries in this section.\n", - "# Avoid imports like import *, from ... import ..., from ... 
import *, etc.\n", - "\n", - "import helpers.hdbg as hdbg\n", - "import helpers.hnotebook as hnotebo" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4ecb72b2-b21d-4fb0-ac92-e7174da390e6", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0mWARNING: Running in Jupyter\n", - "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-783e0930-1631-4d64-8bb4-f3a98bb74fcd.json'\n" - ] - } - ], - "source": [ - "hdbg.init_logger(verbosity=logging.INFO)\n", - "\n", - "_LOG = logging.getLogger(__name__)\n", - "\n", - "hnotebo.config_notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "1ede6422-bff2-4f0a-8d28-29a01d4786b2", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "## Make the notebook flow clear\n", - "Each notebook needs to follow a clear and logical flow, e.g:\n", - "- Load data\n", - "- Compute stats\n", - "- Clean data\n", - "- Compute stats\n", - "- Do analysis\n", - "- Show results\n", - "\n", - "\n", - "\n", - "\n", - "#############################################################################\n", - "Template\n", - "#############################################################################" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8bbd660d-d22f-44fa-bf53-dd622dee0f53", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "class Template:\n", - " \"\"\"\n", - " Brief imperative description of what the class does in one line, if needed.\n", - " \"\"\"\n", - "\n", - " def __init__(self):\n", - " pass\n", - "\n", - " def method1(self, arg1: int) -> None:\n", - " \"\"\"\n", - " Brief imperative description of what the method does in one line.\n", - "\n", - " You can elaborate more in the method docstring in this section, for e.g. explaining\n", - " the formula/algorithm. 
Every method/function should have a docstring, typehints and include the\n", - " parameters and return as follows:\n", - "\n", - " :param arg1: description of arg1\n", - " :return: description of return\n", - " \"\"\"\n", - " # Code bloks go here.\n", - " # Make sure to include comments to explain what the code is doing.\n", - " # No empty lines between code blocks.\n", - " pass\n", - "\n", - "\n", - "def template_function(arg1: int) -> None:\n", - " \"\"\"\n", - " Brief imperative description of what the function does in one line.\n", - "\n", - " You can elaborate more in the function docstring in this section, for e.g. explaining\n", - " the formula/algorithm. Every function should have a docstring, typehints and include the\n", - " parameters and return as follows:\n", - "\n", - " :param arg1: description of arg1\n", - " :return: description of return\n", - " \"\"\"\n", - " # Code bloks go here.\n", - " # Make sure to include comments to explain what the code is doing.\n", - " # No empty lines between code blocks.\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "id": "103f6e36-54cf-442c-b137-8091d48805a7", - "metadata": {}, - "source": [ - "## The flow should be highlighted using headings in markdown\n", - "```\n", - "# Level 1\n", - "## Level 2\n", - "### Level 3\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d05d52af-67ba-4a4f-a561-af453e43854f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,py:percent" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py deleted file mode 100644 index 8566ff277..000000000 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template.example.py +++ /dev/null @@ -1,125 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.0 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Template Example Notebook -# -# This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`. -# -# - Add description of what the notebook does. -# - Point to references, e.g. (neo4j.example.md) -# - Add citations. -# - Keep the notebook flow clear. -# - Comments should be imperative and have a period at the end. -# - Your code should be well commented. -# -# The name of this notebook should in the following format: -# - if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb` -# -# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md - -# %% -# %load_ext autoreload -# %autoreload 2 -# %matplotlib inline - -# %% -import logging -# Import libraries in this section. -# Avoid imports like import *, from ... import ..., from ... import *, etc. 
- -import helpers.hdbg as hdbg -import helpers.hnotebook as hnotebo - -# %% -hdbg.init_logger(verbosity=logging.INFO) - -_LOG = logging.getLogger(__name__) - -hnotebo.config_notebook() - - -# %% [markdown] -# ## Make the notebook flow clear -# Each notebook needs to follow a clear and logical flow, e.g: -# - Load data -# - Compute stats -# - Clean data -# - Compute stats -# - Do analysis -# - Show results -# -# -# -# - - -# ############################################################################# -# Template -# ############################################################################# - - -# %% -class Template: - """ - Brief imperative description of what the class does in one line, if needed. - """ - - def __init__(self): - pass - - def method1(self, arg1: int) -> None: - """ - Brief imperative description of what the method does in one line. - - You can elaborate more in the method docstring in this section, for e.g. explaining - the formula/algorithm. Every method/function should have a docstring, typehints and include the - parameters and return as follows: - - :param arg1: description of arg1 - :return: description of return - """ - # Code bloks go here. - # Make sure to include comments to explain what the code is doing. - # No empty lines between code blocks. - pass - - -def template_function(arg1: int) -> None: - """ - Brief imperative description of what the function does in one line. - - You can elaborate more in the function docstring in this section, for e.g. explaining - the formula/algorithm. Every function should have a docstring, typehints and include the - parameters and return as follows: - - :param arg1: description of arg1 - :return: description of return - """ - # Code bloks go here. - # Make sure to include comments to explain what the code is doing. - # No empty lines between code blocks. 
- pass - - -# %% [markdown] -# ## The flow should be highlighted using headings in markdown -# ``` -# # Level 1 -# ## Level 2 -# ### Level 3 -# ``` - -# %% diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template_utils.py b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template_utils.py deleted file mode 100644 index f8916102e..000000000 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/template_utils.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -template_utils.py - -This file contains utility functions that support the tutorial notebooks. - -- Notebooks should call these functions instead of writing raw logic inline. -- This helps keep the notebooks clean, modular, and easier to debug. -- Students should implement functions here for data preprocessing, - model setup, evaluation, or any reusable logic. - -Import as: - -import class_project.project_template.template_utils as cpptteut -""" - -import pandas as pd -import logging -from sklearn.model_selection import train_test_split -from pycaret.classification import compare_models - -# ----------------------------------------------------------------------------- -# Logging -# ----------------------------------------------------------------------------- - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# ----------------------------------------------------------------------------- -# Example 1: Split the dataset into train and test sets -# ----------------------------------------------------------------------------- - - -def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2): - """ - Split the dataset into training and testing sets. 
- - :param df: full dataset - :param target_column: name of the target column - :param test_size: proportion of test data (default = 0.2) - - :return: X_train, X_test, y_train, y_test - """ - logger.info("Splitting data into train and test sets") - X = df.drop(columns=[target_column]) - y = df[target_column] - return train_test_split(X, y, test_size=test_size, random_state=42) - - -# ----------------------------------------------------------------------------- -# Example 2: PyCaret classification pipeline -# ----------------------------------------------------------------------------- - - -def run_pycaret_classification( - df: pd.DataFrame, target_column: str -) -> pd.DataFrame: - """ - Run a basic PyCaret classification experiment. - - :param df: dataset containing features and target - :param target_column: name of the target column - - :return: comparison of top-performing models - """ - logger.info("Initializing PyCaret classification setup") - ... - - logger.info("Comparing models") - results = compare_models() - ... 
- - return results diff --git a/class_project/data605/Spring2026/projects/docker_build.log b/class_project/data605/Spring2026/projects/docker_build.log new file mode 100644 index 000000000..280a65d01 --- /dev/null +++ b/class_project/data605/Spring2026/projects/docker_build.log @@ -0,0 +1,57 @@ +#0 building with "desktop-linux" instance using docker driver + +#1 [internal] load build definition from Dockerfile +#1 transferring dockerfile: 824B done +#1 DONE 0.0s + +#2 [internal] load metadata for docker.io/library/python:3.12-slim +#2 DONE 0.5s + +#3 [internal] load .dockerignore +#3 transferring context: 2B done +#3 DONE 0.0s + +#4 [ 1/10] FROM docker.io/library/python:3.12-slim@sha256:46cb7cc2877e60fbd5e21a9ae6115c30ace7a077b9f8772da879e4590c18c2e3 +#4 resolve docker.io/library/python:3.12-slim@sha256:46cb7cc2877e60fbd5e21a9ae6115c30ace7a077b9f8772da879e4590c18c2e3 0.0s done +#4 DONE 0.0s + +#5 [internal] load build context +#5 transferring context: 3.43kB done +#5 DONE 0.0s + +#6 [ 7/10] COPY etc_sudoers /etc/sudoers +#6 CACHED + +#7 [ 8/10] COPY bashrc /root/.bashrc +#7 CACHED + +#8 [ 2/10] RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* +#8 CACHED + +#9 [ 9/10] COPY version.sh /install/ +#9 CACHED + +#10 [ 3/10] RUN mkdir -p /install +#10 CACHED + +#11 [ 4/10] COPY requirements.txt /install/requirements.txt +#11 CACHED + +#12 [ 5/10] RUN pip install --upgrade pip && pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt +#12 CACHED + +#13 [ 6/10] COPY etc_sudoers /install/ +#13 CACHED + +#14 [10/10] RUN /install/version.sh 2>&1 | tee version.log +#14 CACHED + +#15 exporting to image +#15 exporting layers done +#15 exporting manifest sha256:6fae8d4c58c30cbd9584b94792553aa81521c8cbd226325402317c19d23ec8bd done +#15 exporting config sha256:17f23757e76f3e39a67cf04cec38373be2f88bd0e9850e89d7c6b7b064e83865 done +#15 exporting attestation manifest 
sha256:c7ad33749cac1ef153bec7acff5e155270f57bda47539d7dab7851761cef1263 done +#15 exporting manifest list sha256:599c534f595871d36583d5361614985c263b7624d583fa13ac2b75f715dfecde done +#15 naming to docker.io/kshitideshpande/docsgpt_project:latest done +#15 unpacking to docker.io/kshitideshpande/docsgpt_project:latest done +#15 DONE 0.0s From 01562c6d90445f79f8d520d48275c775e84d8d8a Mon Sep 17 00:00:00 2001 From: Kshiti Deshpande <113427581+kshitideshpande@users.noreply.github.com> Date: Thu, 7 May 2026 09:39:03 -0400 Subject: [PATCH 4/5] Update README.md --- .../projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md index 11e38d349..d64b192a2 100644 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md @@ -42,5 +42,5 @@ Open **http://localhost:8888** and work through the notebooks in order: | `requirements.txt` | Pinned Python dependencies | | `.env.example` | Template for API key configuration | -See [project template README](../../project_template_README.md) for full -Docker usage details. \ No newline at end of file +See [project template README](../project_template_README.md) for full +Docker usage details. 
From 6097a513c04d33f310eca6d1996720825ba1ee52 Mon Sep 17 00:00:00 2001 From: Kshiti Deshpande <113427581+kshitideshpande@users.noreply.github.com> Date: Thu, 7 May 2026 09:41:18 -0400 Subject: [PATCH 5/5] Update README.md --- .../projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md index d64b192a2..636062984 100644 --- a/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md +++ b/class_project/data605/Spring2026/projects/UmdTask437_DATA605_Spring2026_DocsGPT/README.md @@ -42,5 +42,5 @@ Open **http://localhost:8888** and work through the notebooks in order: | `requirements.txt` | Pinned Python dependencies | | `.env.example` | Template for API key configuration | -See [project template README](../project_template_README.md) for full +See [project template README](project_template_README.md) for full Docker usage details.