diff --git a/.github/workflows/deploy-preview.yml b/.github/workflows/deploy-preview.yml new file mode 100644 index 00000000..17fc8251 --- /dev/null +++ b/.github/workflows/deploy-preview.yml @@ -0,0 +1,76 @@ +# ============================================================================= +# Deploy Preview +# ============================================================================= +# +# PURPOSE: +# Generates a preview build of the documentation site for every pull request +# that touches docs/ files. This lets reviewers see exactly how content +# changes will look on the live site before merging. +# +# HOW IT WORKS: +# 1. Checks out the PR branch +# 2. Installs Node.js dependencies (with npm cache for speed) +# 3. Builds the Docusaurus site (English-only for faster builds) +# 4. Uploads the built HTML as a GitHub Actions artifact (kept for 7 days) +# 5. Posts a comment on the PR informing reviewers that the preview is ready +# +# HOW TO USE THE PREVIEW: +# - Go to the Actions tab on the PR +# - Download the "docs-preview-" artifact +# - Extract and open index.html locally, or serve with `npx serve` +# +# NOTE: NODE_OPTIONS is set to 8GB to prevent out-of-memory errors during +# the Docusaurus webpack build, which can be memory-intensive with many pages. +# +# TRIGGERS: Pull requests that change docs/** files. 
+# ============================================================================= + +name: Deploy Preview + +on: + pull_request: + paths: + - 'docs/**' + +jobs: + # ---- Build English-only site and upload as downloadable artifact ---- + deploy-preview: + runs-on: ubuntu-latest + permissions: + pull-requests: write + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + cache-dependency-path: website/package-lock.json + + - name: Install dependencies + working-directory: website + run: npm ci + + - name: Build (English only for speed) + working-directory: website + env: + NODE_OPTIONS: --max_old_space_size=8192 + run: npm run build -- --locale en + + - name: Deploy to preview + uses: actions/upload-artifact@v4 + with: + name: docs-preview-${{ github.event.pull_request.number }} + path: website/build + retention-days: 7 + + - name: Comment PR with preview info + uses: actions/github-script@v7 + with: + script: | + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '📄 **Docs Preview** built successfully!\n\nDownload the preview artifact from the Actions tab to review locally.\n\n_Built from commit ${{ github.event.pull_request.head.sha }}_' + }); diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml new file mode 100644 index 00000000..82d2b0f0 --- /dev/null +++ b/.github/workflows/docs-build.yml @@ -0,0 +1,108 @@ +# ============================================================================= +# Docs Build & Deploy +# ============================================================================= +# +# PURPOSE: +# Builds the full Docusaurus documentation site and deploys it to GitHub +# Pages when changes land on the main branch. For pull requests, it builds +# the site to verify nothing is broken, but does not deploy. +# +# HOW IT WORKS (3 jobs): +# +# 1. 
build - Installs dependencies, runs `npm run build` to produce +# the static HTML/CSS/JS output in website/build/. This is +# a full production build (all locales, all pages, SSG). +# On pushes to main, uploads the build as a GitHub Pages +# artifact for deployment. +# +# 2. deploy - Only runs on pushes to main (not PRs). Takes the build +# artifact from the previous job and deploys it to the +# GitHub Pages environment. Requires pages:write and +# id-token:write permissions. +# +# 3. pr-comment - Only runs on PRs. Posts a success comment on the PR +# to confirm the docs build passed. +# +# IMPORTANT: This workflow uses NODE_OPTIONS=--max_old_space_size=8192 to +# give Node.js enough memory for the webpack build. Without this, builds +# with many pages can fail with heap out-of-memory errors. +# +# TRIGGERS: push/PR to main or docs branch when docs/** files change. +# ============================================================================= + +name: Docs Build + +on: + push: + branches: [main, docs] + paths: + - 'docs/**' + pull_request: + branches: [main, docs] + paths: + - 'docs/**' + +env: + NODE_OPTIONS: --max_old_space_size=8192 + +jobs: + # ---- Job 1: Build the Docusaurus site ---- + build: + runs-on: ubuntu-latest + defaults: + run: + working-directory: website + steps: + - uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: website/package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Build + run: npm run build + + - name: Upload pages artifact + if: github.ref == 'refs/heads/main' + uses: actions/upload-pages-artifact@v3 + with: + path: website/build + + # ---- Job 2: Deploy to GitHub Pages (main branch only) ---- + deploy: + if: github.ref == 'refs/heads/main' + needs: build + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + 
steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 + + # ---- Job 3: Post build-success comment on PRs ---- + pr-comment: + if: github.event_name == 'pull_request' + needs: build + runs-on: ubuntu-latest + steps: + - name: Comment on PR + uses: actions/github-script@v7 + with: + script: | + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '✅ Docs build succeeded!' + }) diff --git a/.github/workflows/docs-lint.yml b/.github/workflows/docs-lint.yml new file mode 100644 index 00000000..ac24674f --- /dev/null +++ b/.github/workflows/docs-lint.yml @@ -0,0 +1,139 @@ +# ============================================================================= +# Docs Lint +# ============================================================================= +# +# PURPOSE: +# Runs quality and consistency checks on all documentation content whenever +# docs/, examples/, or website/ files change. This workflow is the first line +# of defense against prose errors, broken links, spelling mistakes, and +# documentation drift. +# +# WHAT IT CHECKS (5 parallel jobs): +# +# 1. prose-lint - Runs Vale (https://vale.sh) to enforce a writing style +# guide. Catches passive voice, jargon, inconsistent +# terminology, etc. Config lives in website/.vale.ini. +# +# 2. spell-check - Runs cspell to catch typos and unknown words. +# Custom dictionary is in website/.cspell.json. +# +# 3. sync-check - Verifies that auto-generated docs pages (in docs/) +# are in sync with their source README.md files (in +# examples/). Fails if someone edited a generated .mdx +# without updating the source, or vice versa. +# Fix: run `node website/scripts/sync-examples.mjs` +# +# 4. mdx-only-check - Ensures all documentation files use .mdx extension, +# not plain .md. MDX is required for Docusaurus features +# like component imports and Tabs. +# +# 5. 
link-check - Builds the full Docusaurus site, then runs Lychee +# link checker against the HTML output to catch broken +# internal and external links. Uploads a report artifact +# on failure for easy debugging. +# +# TRIGGERS: push/PR to docs/**, examples/**, website/**, or manual dispatch. +# ============================================================================= + +name: Docs Lint + +on: + push: + paths: + - 'docs/**' + - 'examples/**' + - 'website/**' + pull_request: + paths: + - 'docs/**' + - 'examples/**' + - 'website/**' + workflow_dispatch: + +jobs: + # ---- Job 1: Writing style enforcement ---- + prose-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Vale + uses: errata-ai/vale-action@v2 + with: + files: docs/ + vale_flags: --config=website/.vale.ini + + # ---- Job 2: Typo and spelling detection ---- + spell-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Spell check + uses: streetsidesoftware/cspell-action@v6 + with: + files: docs/** + config: website/.cspell.json + + # ---- Job 3: Ensure examples/ README.md files match docs/ .mdx files ---- + sync-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Verify examples/docs sync + run: node website/scripts/sync-examples.mjs --check + + # ---- Job 4: Enforce .mdx extension for all docs files ---- + mdx-only-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Ensure no .md files in docs/ + run: | + MD_FILES=$(find docs/ -name '*.md' -not -name 'README.md' -not -path '*/node_modules/*' | head -20) + if [ -n "$MD_FILES" ]; then + echo "ERROR: Found .md files in docs/ that should be .mdx:" + echo "$MD_FILES" + exit 1 + fi + + # ---- Job 5: Build site and check all links (internal + external) ---- + link-check: + runs-on: ubuntu-latest + defaults: + run: + working-directory: website + steps: + - uses: 
actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies + run: npm ci + + - name: Build site + run: npm run build + + - name: Lychee link checker + uses: lycheeverse/lychee-action@v2 + with: + args: '--config .lychee.toml --base website/build --no-progress website/build' + fail: true + output: ./lychee-report.md + + - name: Upload link check report + if: failure() + uses: actions/upload-artifact@v4 + with: + name: link-check-report + path: ./lychee-report.md diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml new file mode 100644 index 00000000..e45095e3 --- /dev/null +++ b/.github/workflows/pr-checks.yml @@ -0,0 +1,137 @@ +# ============================================================================= +# PR Checks +# ============================================================================= +# +# PURPOSE: +# Gate checks that run on every pull request to main. These are the minimum +# quality bars that must pass before a PR can be merged. They focus on +# structural correctness of the docs site (not prose quality -- that's in +# docs-lint.yml). +# +# WHAT IT CHECKS (4 jobs, some run in parallel): +# +# 1. enforce-mdx - Scans docs/ for any .md files (which should be .mdx). +# All documentation must use .mdx extension for Docusaurus +# MDX features (component imports, Tabs, etc.) to work. +# Plain .md files will fail the build silently or produce +# broken pages. +# +# 2. mdx-check - Runs the Docusaurus MDX syntax checker to catch invalid +# JSX, unclosed tags, missing imports, and other MDX +# compilation errors BEFORE attempting a full build. +# Much faster feedback than waiting for a build failure. +# +# 3. build-test - Full production build of the Docusaurus site. Catches +# broken links, missing pages, import errors, and any +# issues that only surface during static site generation +# (SSG). If this passes, the site will deploy cleanly. +# +# 4. 
smoke-test - Runs Playwright end-to-end tests against the built site. +# Waits for build-test to complete first (needs: build-test). +# Verifies that key pages load, navigation works, search +# functions, and critical UI elements render correctly. +# Config: website/playwright.config.ts +# +# TRIGGERS: Pull requests to main, or manual dispatch. +# ============================================================================= + +name: PR Checks + +on: + pull_request: + branches: [main] + workflow_dispatch: + +jobs: + # ---- Job 1: Reject plain .md files in docs/ ---- + enforce-mdx: + name: Enforce .mdx (no .md files) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check for .md files in docs/ + run: | + # Find any .md files in docs/ content (excluding website/ and node_modules/) + MD_FILES=$(find docs -name "*.md" \ + -not -path "*/website/*" \ + -not -path "*/node_modules/*" \ + 2>/dev/null || true) + + if [ -n "$MD_FILES" ]; then + echo "::error::Found .md files in docs/. All documentation must use .mdx extension." + echo "" + echo "Files that need to be renamed to .mdx:" + echo "$MD_FILES" + echo "" + echo "Rename with: mv .md .mdx" + exit 1 + fi + echo "All documentation files use .mdx extension." 
+ + # ---- Job 2: Validate MDX syntax (fast, no full build) ---- + mdx-check: + runs-on: ubuntu-latest + defaults: + run: + working-directory: website + steps: + - uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies + run: npm ci + + - name: MDX check + run: npx docusaurus-mdx-checker + + # ---- Job 3: Full production build (catches SSG errors, broken links) ---- + build-test: + runs-on: ubuntu-latest + defaults: + run: + working-directory: website + steps: + - uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies + run: npm ci + + - name: Build + run: npm run build + + # ---- Job 4: Playwright E2E tests against the built site ---- + smoke-test: + runs-on: ubuntu-latest + needs: build-test + defaults: + run: + working-directory: website + steps: + - uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies + run: npm ci + + - name: Install Playwright browsers + run: npx playwright install --with-deps + + - name: Build + run: npm run build + + - name: Run Playwright smoke tests + run: npx playwright test --config=playwright.config.ts diff --git a/.github/workflows/test-code-samples.yml b/.github/workflows/test-code-samples.yml new file mode 100644 index 00000000..437b5974 --- /dev/null +++ b/.github/workflows/test-code-samples.yml @@ -0,0 +1,166 @@ +# ============================================================================= +# Test Code Samples +# ============================================================================= +# +# PURPOSE: +# Validates that every code snippet in the documentation actually works. +# This workflow extracts code blocks from MDX files, runs them, and updates +# the CI validation status badges shown on each docs page. 
This is the +# "docs as code" principle -- if the code in the docs is broken, CI fails. +# +# HOW IT WORKS (3-stage pipeline): +# +# 1. syntax-check - Runs on ubuntu-latest (no AMD hardware needed). +# Uses extract_code_blocks.py to parse every .mdx file, +# extract fenced code blocks (```python, ```bash, etc.), +# and run syntax/import checks on them. Blocks tagged +# with `notest` are skipped. This catches obvious errors +# (syntax errors, missing imports) without needing real +# hardware. Results are saved to syntax-results.json. +# +# 2. test-npu - Runs on a SELF-HOSTED runner with AMD NPU hardware +# (labels: self-hosted, windows, npu, ryzen-ai). Only +# runs after syntax-check passes. Does two things: +# a) Runs pytest on standalone code-samples/ test files +# b) Runs extract_code_blocks.py again, this time with +# FULL EXECUTION -- actually running every code +# block to verify it produces correct output on +# real AMD hardware. +# Results are saved to test-results.json. +# +# 3. update-ci-status - Runs on ubuntu-latest after test-npu completes. +# Only runs on main branch or manual dispatch. +# Downloads the test results, then runs +# update_ci_status.py to update the `ci_validated` +# and `ci_last_run` front matter fields in each .mdx +# file. These fields drive the badge shown +# at the top of each docs page, telling readers whether +# the code on that page has been verified on real +# hardware. Changes are committed automatically. +# +# CODE BLOCK CONVENTIONS: +# - ```python -- will be extracted and tested +# - ```python notest -- will be SKIPPED (e.g., pseudocode, partial snippets) +# - ```bash -- tested on Linux runners +# - ```bat -- tested on Windows runners +# - ```powershell -- tested on Windows runners +# +# SELF-HOSTED RUNNER REQUIREMENTS: +# Same as validate-models.yml: Windows 11, AMD NPU, ryzen-ai-1.7.0 conda env. +# +# TRIGGERS: push/PR when docs/** or website/code-samples/** change, or manual. 
+# ============================================================================= + +name: Test Code Samples + +on: + push: + branches: [main] + paths: + - 'docs/**' + - 'website/code-samples/**' + - 'docs/**/*.mdx' + pull_request: + paths: + - 'docs/**' + - 'website/code-samples/**' + - 'docs/**/*.mdx' + workflow_dispatch: + +jobs: + # ---- Stage 1: Lightweight syntax/import checks (no hardware needed) ---- + syntax-check: + name: Syntax & import checks (no hardware) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Run syntax checks on inline blocks + standalone files + run: | + python website/ci/extract_code_blocks.py \ + --run --syntax-only \ + --check-files --check-requirements \ + --output-json syntax-results.json + + - name: Upload syntax results + uses: actions/upload-artifact@v4 + if: always() + with: + name: syntax-results + path: syntax-results.json + + # ---- Stage 2: Full execution on AMD NPU hardware (self-hosted runner) ---- + test-npu: + name: Full execution (NPU hardware) + runs-on: [self-hosted, windows, npu, ryzen-ai] + needs: syntax-check + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: pip install -r website/code-samples/requirements.txt + + - name: Test code samples + run: pytest website/code-samples/ --tb=short -v + + - name: Run full execution of inline code blocks + run: | + python website/ci/extract_code_blocks.py \ + --run --check-files --check-imports \ + --output-json test-results.json + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: test-results + path: test-results.json + + # ---- Stage 3: Write pass/fail results back into .mdx front matter ---- + update-ci-status: + name: Update CI validation badges + runs-on: ubuntu-latest + needs: test-npu + if: 
github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download test results + uses: actions/download-artifact@v4 + with: + name: test-results + path: . + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Update CI status in MDX files + run: | + python website/ci/update_ci_status.py \ + --results test-results.json \ + --date $(date +%Y-%m-%d) + + - name: Commit updated CI status + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add docs/**/*.mdx + git diff --cached --quiet && echo "No CI status changes" && exit 0 + git commit -m "ci: update CI validation status [skip ci]" + git push diff --git a/.github/workflows/validate-models.yml b/.github/workflows/validate-models.yml new file mode 100644 index 00000000..9fa70f59 --- /dev/null +++ b/.github/workflows/validate-models.yml @@ -0,0 +1,151 @@ +# ============================================================================= +# Validate Models +# ============================================================================= +# +# PURPOSE: +# Automated model validation pipeline that runs all supported AI models on +# real AMD Ryzen AI hardware (NPU, GPU) to verify they actually work. This +# is the "proof" behind the supported models list in the documentation. +# Results are committed back to the repo so the docs site always reflects +# the latest verified state. +# +# ARCHITECTURE (3-stage pipeline): +# +# 1. discover - Runs on ubuntu-latest (no hardware needed). Reads the +# hardware registry config and scans the docs to produce a +# JSON list of all models that need testing. This list is +# passed to the next stage as an artifact. +# +# 2. 
test - Runs on a SELF-HOSTED runner with AMD NPU hardware +# (labels: self-hosted, windows, npu, ryzen-ai). Downloads +# the model list from discover, then runs three test +# harnesses in sequence: +# - harness_llm.py (LLM models: Llama, Mistral, etc.) +# - harness_vision.py (Vision: ResNet, YOLO, etc.) +# - harness_audio.py (Audio: Whisper, etc.) +# Each harness loads the model, runs inference, checks +# output validity, and records pass/fail + performance +# metrics to a JSON results file. +# +# 3. gate - Runs on ubuntu-latest. Aggregates all test results, +# applies pass/fail thresholds, and writes the final +# verified model list to docs/reference/model-list-data.json. +# On main branch, commits and pushes this file so the docs +# site automatically shows which models are verified. +# +# SCHEDULE: Runs weekly (Monday 6:00 AM UTC) and on manual dispatch. +# This means the model validation table stays current without manual effort. +# +# SELF-HOSTED RUNNER REQUIREMENTS: +# - Windows 11 with AMD Ryzen AI NPU +# - Ryzen AI Software installed (ryzen-ai-1.7.0 conda environment) +# - GitHub Actions runner registered with labels: self-hosted, windows, +# npu, ryzen-ai +# ============================================================================= + +name: Validate Models + +on: + schedule: + - cron: '0 6 * * 1' + workflow_dispatch: + +jobs: + # ---- Stage 1: Scan docs and config to build list of models to test ---- + discover: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Discover models + run: | + python website/ci/model-validation/discover.py \ + --config website/ci/hardware-registry.json \ + --output models-to-test.json + + - name: Upload models list + uses: actions/upload-artifact@v4 + with: + name: models-to-test + path: models-to-test.json + + # ---- Stage 2: Run models on real AMD hardware (self-hosted runner) ---- + test: + runs-on: 
[self-hosted, windows, npu, ryzen-ai] + needs: discover + steps: + - uses: actions/checkout@v4 + + - name: Download models list + uses: actions/download-artifact@v4 + with: + name: models-to-test + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install requirements + run: pip install -r website/code-samples/requirements.txt + + - name: Run LLM harness + run: | + python website/ci/model-validation/harness_llm.py \ + --models models-to-test.json \ + --output results/llm-results.json + + - name: Run vision harness + run: | + python website/ci/model-validation/harness_vision.py \ + --models models-to-test.json \ + --output results/vision-results.json + + - name: Run audio harness + run: | + python website/ci/model-validation/harness_audio.py \ + --models models-to-test.json \ + --output results/audio-results.json + + - name: Upload results + uses: actions/upload-artifact@v4 + with: + name: validation-results + path: results/ + + # ---- Stage 3: Aggregate results and update the verified model list ---- + gate: + runs-on: ubuntu-latest + needs: test + if: github.ref == 'refs/heads/main' + steps: + - uses: actions/checkout@v4 + + - name: Download results + uses: actions/download-artifact@v4 + with: + name: validation-results + path: results/ + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Run gate + run: | + python website/ci/model-validation/gate.py \ + --results results/ \ + --output docs/reference/model-list-data.json + + - name: Commit and push + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add docs/reference/model-list-data.json + git diff --staged --quiet || (git commit -m "chore: update model list from validation" && git push) diff --git a/.gitignore b/.gitignore index 329e9f2b..a6cde1bb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,70 @@ -*~ -*.swp -__pycache__ 
-*node_modules* -*.yml +# Dependencies +node_modules/ + +# Docusaurus +.docusaurus/ +build/ +.cache-loader + +# Python +__pycache__/ +*.pyc +*.pyo +.venv/ +*.egg-info/ + +# CI artifacts +docs/website/ci/validation-results/ +docs/website/ci/extracted-blocks/ +docs/website/ci/model-validation/results/*.tmp + +# Generated docs (from docusaurus-plugin-llms) +# The plugin copies .mdx content as .md into the website dir +docs/website/docs/ +docs/website/applications/*.md +docs/website/applications.md +docs/website/develop/**/*.md +docs/website/getting-started/**/*.md +docs/website/models-tutorials/**/*.md +docs/website/reference/**/*.md +docs/website/templates/**/*.md +docs/website/tools/**/*.md + +# Downloaded models +*-onnx-ryzenai-*/ + +# Build artifacts *_Perf.log test.json -benchmark_scripts \ No newline at end of file +benchmark_scripts/ +build_out/ + +# C++ build artifacts (ResNet examples) +configure.txt +passContext.txt +original-info-signature.txt +original-model-signature.txt +output.log + +# Local validation scripts +run_validation.bat +validation-output.txt + +# Editor +*~ +*.swp +.DS_Store + +# Environment +.env.local +.env.development.local +.env.test.local +.env.production.local + +# Logs +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Vale downloaded styles +docs/website/.vale/styles/Vale/ diff --git a/.lychee.toml b/.lychee.toml new file mode 100644 index 00000000..39f24ce8 --- /dev/null +++ b/.lychee.toml @@ -0,0 +1,56 @@ +## Lychee link checker configuration +## https://lychee.cli.rs/usage/config/ + +# Check external links (HTTP/HTTPS) +include_verbatim = true + +# Timeout per request (seconds) +timeout = 30 + +# Max retries per link +max_retries = 3 + +# Max concurrent requests +max_concurrency = 8 + +# User-agent string (some sites block bots) +user_agent = "Mozilla/5.0 (compatible; lychee/0.23; +https://github.com/lycheeverse/lychee)" + +# Accept these HTTP status codes as valid +accept = [200, 204, 301, 302, 307, 308] + +# Exclude 
private/internal IPs +exclude_all_private = true + +# Do not check mail addresses +include_mail = false + +# Glob patterns for files to skip +exclude_path = [ + "docs/website/node_modules/**", + "docs/website/.docusaurus/**", + "docs/website/build/**", + "docs/templates/**", +] + +# URL patterns to exclude from checking (regex) +exclude = [ + # AMD websites have HTTP/2 protocol issues with automated checkers + "https://www\\.amd\\.com/.*", + "https://community\\.amd\\.com/.*", + # AMD account-gated downloads (require login) + "https://account\\.amd\\.com/.*", + # Hugging Face model repos (rate-limited / gated) + "https://huggingface\\.co/amd/.*", + "https://huggingface\\.co/collections/.*", + "https://hf\\.co/.*", + # localhost references (used in code examples) + "http://localhost.*", + "http://127\\.0\\.0\\.1.*", + # PyPI simple index (not browsable) + "https://pypi\\.amd\\.com/.*", + # Lunarg URLs with shell variables + ".*\\$UBUNTU_CODENAME.*", + # Reacher API endpoint (not a browsable URL) + "https://api\\.reacher\\.email/.*", +] diff --git a/CNN-examples/getting_started_resnet/bf16/app/.gitignore b/CNN-examples/getting_started_resnet/bf16/app/.gitignore deleted file mode 100644 index 9f4078bc..00000000 --- a/CNN-examples/getting_started_resnet/bf16/app/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -build -configure.txt -passContext.txt -original-info-signature.txt -original-model-signature.txt -output.log \ No newline at end of file diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..b4f66bd8 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,91 @@ +# CODEOWNERS - Ryzen AI Software +# +# Each path has a designated owner who must approve PRs touching that path. +# Replace @bconsolvo placeholders with actual GitHub usernames as the team grows. 
+# +# Docs: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners + +# ────────────────────────────────────────────── +# Default owner (catch-all for anything not listed below) +# ────────────────────────────────────────────── +* @bconsolvo + +# ────────────────────────────────────────────── +# Repository root files +# ────────────────────────────────────────────── +/*.md @bconsolvo +/LICENSE @bconsolvo +/CODEOWNERS @bconsolvo +/.gitignore @bconsolvo +/.gitattributes @bconsolvo +/.lychee.toml @bconsolvo + +# ────────────────────────────────────────────── +# CI/CD workflows (all managed together) +# ────────────────────────────────────────────── +/.github/ @bconsolvo + +# ────────────────────────────────────────────── +# Website infrastructure (Docusaurus config, theme, scripts, i18n) +# ────────────────────────────────────────────── +/website/ @bconsolvo + +# ══════════════════════════════════════════════ +# DOCUMENTATION (docs/) +# ══════════════════════════════════════════════ + +# ────────────────────────────────────────────── +# Getting Started +# ────────────────────────────────────────────── +/docs/getting-started/installation.mdx @bconsolvo +/docs/getting-started/overview.mdx @bconsolvo +/docs/getting-started/quickstart.mdx @bconsolvo +/docs/getting-started/supported-hardware.mdx @bconsolvo + +# ────────────────────────────────────────────── +# Applications +# ────────────────────────────────────────────── +/docs/applications/ @bconsolvo + +# ────────────────────────────────────────────── +# Models & Tutorials - Overview +# ────────────────────────────────────────────── +/docs/models-tutorials/index.mdx @bconsolvo +/docs/models-tutorials/examples.mdx @bconsolvo + +# Models & Tutorials - Audio (docs + examples) +/docs/models-tutorials/audio/ @bconsolvo +/examples/audio/ @bconsolvo + +# Models & Tutorials - Large Language Models (docs + examples) +/docs/models-tutorials/llms/ 
@bconsolvo +/examples/llms/ @bconsolvo + +# Models & Tutorials - Multimodal (docs + examples) +/docs/models-tutorials/multimodal/ @bconsolvo +/examples/multimodal/ @bconsolvo + +# Models & Tutorials - NLP (docs + examples) +/docs/models-tutorials/nlp/ @bconsolvo +/examples/nlp/ @bconsolvo + +# Models & Tutorials - Vision (docs + examples) +/docs/models-tutorials/vision/ @bconsolvo +/examples/vision/ @bconsolvo + +# ────────────────────────────────────────────── +# Develop & Tools (docs + examples) +# ────────────────────────────────────────────── +/docs/develop/ @bconsolvo +/docs/tools/ @bconsolvo +/examples/tools/ @bconsolvo + +# ────────────────────────────────────────────── +# Reference +# ────────────────────────────────────────────── +/docs/reference/ @bconsolvo + +# ────────────────────────────────────────────── +# Site map +# ────────────────────────────────────────────── +/docs/README.mdx @bconsolvo diff --git a/Demos/ASR/Whisper/README.md b/Demos/ASR/Whisper/README.md deleted file mode 100644 index 3f4bee5d..00000000 --- a/Demos/ASR/Whisper/README.md +++ /dev/null @@ -1,158 +0,0 @@ - - - - -

Ryzen™ AI Automatic Speech Recognition

-
- -# Automatic Speech Recognition using OpenAI Whisper - -Unlock fast, on-device speech recognition with RyzenAI and OpenAI’s Whisper. This demo walks you through preparing and running OpenAI's Whisper (base, small, medium) for fast, local ASR on AMD NPU. - -## Features - -* 🚀 Download NPU Optimized Whisper ONNX models from HF -* ⚡ Run ASR locally on CPU or NPU -* 📊 Evaluate ASR on LibriSpeech samples and report WER/CER -* 🎧 Supports transcription of audio files and microphone input -* ⏱️ Reports Performance using RTF and TTFT - -## 🔗 Quick Links -- [Prerequisites](#prerequisites) -- [Accelerate Whisper on AMD NPU](#accelerate-whisper-on-amd-npu) - - [Why run on NPU?](#why-run-on-npu) - - [Set up VitisEP Configuration for NPU](#set-up-vitisep-configuration-for-npu) -- [ Usage](#usage) - - [Transcribe Audio File](#transcribe-audio-file) - - [Transcribe from Microphone](#transcribe-from-microphone) - - [Evaluate on Dataset](#evaluate-on-dataset) -- [ Notes](#notes) - -## 📦 Prerequisites - -1. **Install Ryzen AI SDK** - Follow [RyzenAI documentation](https://ryzenai.docs.amd.com/en/latest/inst.html#) to install SDK and drivers. - -2. **Activate environment** - - ```bash - conda activate ryzen-ai- - ``` - -3. **Clone repository** - - ```bash - git clone https://github.com/amd/RyzenAI-SW.git - cd RyzenAI-SW/demo/ASR/Whisper - ``` - -4. **Install dependencies** - - ```bash - pip install -r requirements.txt - ``` - -## ⚡Accelerate Whisper on AMD NPU - -### Why run on NPU? - -* Offloads compute from CPU onto NPU, freeing up CPU for other tasks. -* Delivers higher throughput and lower power consumption when running AI workloads -* Optimized execution of Whisper’s encoder and decoder models. -* Runs models with BFP16 precision for near-FP32 accuracy and INT8-like performance. - -#### NPU Run for Whisper-Base -When running inference on the NPU, 100% of the encoder operators and 93.4% of the decoder operators are executed on the NPU. 
-```bash - #encoder operations - [Vitis AI EP] No. of Operators : VAIML 225 - [Vitis AI EP] No. of Subgraphs : VAIML 1 - - #decoder operations - [Vitis AI EP] No. of Operators : CPU 24 VAIML 341 - [Vitis AI EP] No. of Subgraphs : VAIML 2 -``` -#### Set up VitisEP Configuration for NPU - -* Edit `config/model_config.json` to specify Execution Providers. -* For NPU: - - * Set `cache_key` and `cache_dir` - * Use corresponding `vitisai_config` from `config/` - -Example: - -```json -{ - "config_file": "config/vitisai_config_whisper_decoder.json", - "cache_dir": "./cache", - "cache_key": "whisper_medium_decoder" -} -``` -#### ⚠️ Special Instructions for Whisper-Medium -When running whisper-medium on NPU, it is recommended to add the following flags to `configs\vitisai_config_whisper_encoder.json` in case of compilation issues. -```json -"vaiml_config": { - "optimize_level": 3, - "aiecompiler_args": "--system-stack-size=512" -} -``` -These settings: - -- optimize_level=3: Enables aggressive optimizations for larger models. -- --system-stack-size=512: Increases the AI Engine system stack size to handle Whisper-Medium’s higher resource demand. - -## 🚀 Usage - -### Transcribe Audio File -Use this to transcribe a pre-recorded `.wav` file into text using the Whisper model -```bash -python run_whisper.py \ - --model-type <model-type> \ - --device npu \ - --input path/to/audio.wav -``` -- Replace `<model-type>` with whisper-base, whisper-small, or whisper-medium. - -- Replace path/to/audio.wav with your audio file. - -For example, run whisper-large-v3-turbo -```bash -python run_whisper.py --model-type whisper-large-v3-turbo --device npu --input audio_files\1089-134686-0000.wav -``` - -### Transcribe from Microphone -Run real-time speech-to-text by capturing audio from your microphone. 
This allows you to speak and see live transcription: - -```bash -python run_whisper.py \ - --model-type \ - --device npu \ - --input mic \ - --duration 0 -``` -- --duration 0 means continuous recording until stopped (Ctrl+C) or detects silence for a set duration - -- Ideal for demos and testing live ASR performance. - -### Evaluate on Dataset -Run batch evaluation on a dataset (e.g., LibriSpeech samples) to measure model performance with metrics like WER, CER, and RTF: -```bash -python run_whisper.py \ - --model-type \ - --device npu \ - --eval-dir eval_dataset/LibriSpeech-samples \ - --results-dir results -``` -- --eval-dir specifies the dataset directory. - -- --results-dir is where evaluation reports (WER, CER, TTFT, RTF) will be saved. - -- Useful for benchmarking and validating models. - -## Notes - -* First run on NPU may take \~15 min for model compilation. -* Ensure paths for encoder, decoder, and config files are correct. -* Supports CPU and NPU devices. - diff --git a/LICENSE.txt b/LICENSE old mode 100755 new mode 100644 similarity index 100% rename from LICENSE.txt rename to LICENSE diff --git a/LLM-examples/RAG-OGA/README.md b/LLM-examples/RAG-OGA/README.md deleted file mode 100644 index d4421630..00000000 --- a/LLM-examples/RAG-OGA/README.md +++ /dev/null @@ -1,275 +0,0 @@ - - - - -

Ryzen™ AI RAG

-
- -## Introduction -Welcome to this repository, a showcase of an **ONNX Runtime GenAI(OGA)‑based RAG LLM sample application** running on a **Ryzen AI processor**. -This repo provides supplemental code to the AMD Blog [RAG with Hybrid LLM on AMD Ryzen AI Processor](https://www.amd.com/en/developer/resources/technical-articles/2025/rag-with-hybrid-llm-on-amd-ryzen-ai-processors.html). - -## What You’ll Find Here - -- **Retrieval-Augmented Generation (RAG) pipeline** powered by: - - A **hybrid LLM** enables disaggregated inference in which the compute-heavy prefill phase runs on the NPU, while the decode phase executes on the GPU. - - An **embedding model** compiled with **Vitis AI Execution Provider** -- Built using the widely adopted **LangChain** orchestration framework - -## Quick Setup - -Follow these simple steps to get started: - -1. Execute the setup steps outlined below to provision your environment. -2. After setup, this README will guide you through how to run the sample application. - ---- - -## 1. Installation and Setup - -### 1.1 Download the ONNX-Based Llama Model from Hugging Face - -```sh -git clone https://huggingface.co/amd/Llama-3.2-3B-Instruct-onnx-ryzenai-1.7-hybrid /path/to/your/directory/ -``` -replace `/path/to/your/directory/` with actual path where you want to download the model. - -### 1.2 Activate Ryzen AI Environment - -To ensure compatibility with ONNX-based Llama model, activate the ryzen-ai-1.7.0 Conda environment. 
- -Please follow instructions provided in the official AMD documentation to install Ryzen AI 1.7.0: - -👉 [Ryzen AI 1.7.0 Installation and Conda Environment Creation](https://ryzenai.docs.amd.com/en/latest/inst.html) - -After installation, activate the environment by running: - -```sh -conda activate ryzen-ai-1.7.0 -``` -### 1.3 Install Dependencies - -After activating the environment, install the required Python dependencies by running: -```sh -git clone https://github.com/amd/RyzenAI-SW.git -cd RyzenAI-SW/example/llm/RAG-OGA -pip install -r requirements.txt -``` - -## 2. Demo -To explore the use case, please refer below steps: - -### 2.1 Retrieval-Augmented Generation (RAG) Pipeline - -This example demonstrates a Retrieval-Augmented Generation (RAG) pipeline orchestrated using the LangChain framework. In this setup, documents are indexed into a Facebook AI Similarity Search(FAISS) vector database and retrieved at inference time to enrich user prompts with relevant contextual information. - -The following models are deployed using Ryzen AI 1.7.0: - -- **Embedding Model**: [BGE (BAAI General Embedding)](https://huggingface.co/BAAI/bge-large-en-v1.5), compiled using Vitis AI Execution Provider. - -- **Hybrid LLM**: [Llama3.2-3B-Instruct](https://huggingface.co/amd/Llama-3.2-3B-Instruct-onnx-ryzenai-1.7-hybrid), a quantized ONNX model, running using the OGA(OnnxRuntime GenAI) framework on Ryzen AI 1.7.0. - -By running both critical models on the NPU and/or GPU, this setup enables faster and more efficient inference, delivering a high-performance RAG system optimized for AI PCs. - -

- RAG Diagram -

-

RAG Workflow with LangChain and ONNX

- -### 2.2 🔑 Key Components of the LangChain-Based RAG Pipeline - -This RAG pipeline runs locally on an AMD Ryzen AI PC (with NPU & GPU). It combines LangChain, FAISS, embeddings, and an LLM to deliver fast, on‑device question‑answering. - -#### 🔹 Data Embedding -Documents are preprocessed and converted into dense vector representations using the BGE embedding model. - -#### 🔹 ONNX Inference on AMD NPU -The embedding model is executed using ONNX Runtime on the NPU (Ryzen AI). - -#### 🔹 Vector Store Creation -Document embeddings are stored in a FAISS-based vector database for efficient similarity search. - -#### 🔹 Context Retrieval -The vector database returns the most relevant document chunks based on the embedded query. - -#### 🔹 LLM Prompt Construction -LangChain constructs a prompt using the user’s query, prompt template, and the retrieved context. - -#### 🔹 LLM Response Generation -The retrieved data, along with the user’s query, is fed into a custom LLM, running on a hybrid flow (GPU and NPU), to generate a response from the retrieved data. - -### 2.3 Download, Export to ONNX, and Compile the Embedding Model. - -Run the following command to perform download, export and compile steps: - -```bash -python custom_embedding/export_bge_onnx.py -``` -Note : Please ensure that you have activated your ryzen‑ai‑1.7.0 environment and are in the RyzenAI‑SW/example/llm/RAG‑OGA directory. - -This script generates a static‑shape, non‑quantized FP32 ONNX model that serves as the baseline for further deployment. -The compiled BGE (BAAI General Embedding) ONNX model will be stored in the cache folder named ``modelcachekey_bge``. - - -### 2.4 Run the sample RAG application - -The system supports two modes of query handling. -- ``--direct_llm`` mode, where the user's query is directly sent to the LLM without any document retrieval. 
- -- If ``--direct_llm`` flag is not specified, the query triggers retrieval from a FAISS index, enriching the prompt with relevant context before passing it to the LLM. - -#### Required Setup: Update Paths in rag.py -- Dataset Path: - Replace the placeholder with the dataset provided in this directory used to build the FAISS index. -``` -dataset_path = r"./Dataset" -``` - -- LLM Model Path: - Replace the path to your LLM model that you downloaded in step 1.1 -``` -llm = custom_llm(model_path="path/to/llm") -``` - - -## 2.5 Sample Outputs - -**Case 1: Direct LLM mode (where no retrieval is being done)** -```sh -python rag.py --direct_llm -``` -Ask any question - -**For instance,** -``` -Enter your question: what is NPU and tell me the three important feature of NPU. -Direct_llm mode is on. No retrieval has been performed. -LLM_call invoked: 1 time(s) -Answer: -NPU stands for Net Protein Utilization, which is a measure of the proportion of dietary protein that is actually utilized by the body for growth and maintenance of tissues. The three important features of NPU are: (1) It is a measure of protein quality, indicating the extent to which a protein is effective in promoting growth and maintenance of body tissues. (2) It is influenced by factors such as the protein's amino acid composition, digestibility, and bioavailability. (3) NPU is a critical factor in determining the adequacy of protein intake, as it helps to identify the protein sources that are most effective in meeting the body's protein needs. - -``` - -**Case 2: Retrieval mode** - -In the **Retrieval mode**, documents most similar to the query are retrieved using FAISS, enabling efficient semantic search based on vector similarity. -You can observe how the model behaves differently between direct mode and retrieval mode: - -For instance, -```sh -python rag.py -``` -**Sample Output** - -***Question 1*** -``` -Enter your question: what is NPU and tell me the three important feature of NPU. 
-Retrieval mode is on. -Loading existing FAISS index from disk... -LLM_call invoked: 1 time(s) -Answer: -The NPU (Neural Processing Unit) is a specialized processor designed for neural network processing, specifically for deep learning and artificial intelligence applications. -The three important features of NPU are: -1. **High Performance**: NPU is designed to provide high-performance computing for deep learning workloads, making it an ideal choice for applications that require fast processing of large amounts of data. -2. **Energy Efficiency**: NPU is designed to be energy-efficient, which is critical for mobile devices and other applications where power consumption is a major concern. -3. **Low Latency**: NPU is designed to provide low latency, which is critical for real-time applications such as autonomous vehicles, robotics, and other IoT devices. -``` - -***Question 2*** - -``` -Enter your question: what are the main feature provided by the AMD analyzer, and how does it help in visualizing model execution on Ryzen AI ? -Retrieval mode is on. -Loading existing FAISS index from disk... -LLM_call invoked: 1 time(s) - -Answer: - ## Step 1: Identify the main features of the AMD AI Analyzer -The AMD AI Analyzer is a tool that supports analysis and visualization of model compilation and inference on Ryzen AI. The main features provided by the AMD AI Analyzer include: - -- Graph and operator partitions between the NPU and CPU -- Visualization of graph and operator partitions -- Profiling and visualization of model execution -- Generation of artifacts related to inference profile and graph partitions - -## Step 2: Explain how the AMD AI Analyzer helps in visualizing model execution on Ryzen AI -The AMD AI Analyzer helps in visualizing model execution on Ryzen AI by providing a comprehensive view of the model's performance and execution on the NPU. 
The tool allows users to: - -- Visualize graph and operator partitions to understand how the model is processed by the hardware -- Profile and visualize model execution to identify performance bottlenecks -- Generate artifacts related to inference profile and graph partitions to gain deeper insights into the model's behavior - -## Step 3: Highlight the benefits of using the AMD AI Analyzer -The AMD AI Analyzer provides several benefits, including: - -- Improved understanding of model execution on Ryzen AI -- Identification of performance bottlenecks and optimization opportunities -- Generation of artifacts for further analysis and optimization - -The final answer is: The AMD AI Analyzer provides a comprehensive set of features that help in visualizing model execution on Ryzen AI, including graph and operator partitions, profiling and visualization, and generation of artifacts related to inference profile and graph partitions. These features enable users to gain a deeper understanding of the model's performance and behavior on the NPU, identify performance bottlenecks, and optimize the model for better performance and power efficiency. - - -``` - - -***Question 3*** -``` -Enter your question: In the context of Ryzen AI Software's hybrid inference model, how does the integration of automated -operator assignment, encrypted context caching, and hardware-specific xclbin configurations collectively contribute to -optimizing performance, ensuring security, and minimizing compilation overhead across varying model types such as transformers -and CNNs? -Retrieval mode is on. -Loading existing FAISS index from disk... - -Answer: -The integration of automated operator assignment, encrypted context caching, and hardware-specific xclbin configurations collectively contributes to optimizing performance, ensuring security, and minimizing compilation overhead across varying model types such as transformers and CNNs in the following ways: - -1. 
**Automated Operator Assignment**: This feature optimizes the placement of operators in the model, ensuring that the most efficient and effective assignments are made, which leads to improved performance and reduced computational overhead. - -2. **Encrypted Context Caching**: This feature ensures that sensitive model data is protected from unauthorized access, thereby enhancing security. By caching context information, the model can be efficiently transferred and executed across different environments, reducing the need for manual intervention and minimizing compilation overhead. - -3. **Hardware-Specific xclbin Configurations**: These configurations are tailored to the specific capabilities of the target platform, ensuring that INT8 models are optimized for the hardware, which leads to improved performance and reduced power consumption. This also enhances security by protecting sensitive model data from unauthorized access. - -Together, these features work synergistically to optimize performance, ensure security, and minimize compilation overhead across varying model types such as transformers and CNNs. This results in faster inference times, reduced power consumption, and improved overall efficiency, making the Ryzen AI Software's hybrid inference model a powerful tool for AI and machine learning applications -``` - - -## 2.6 Profiling - -The example code also captures key LLM performance metrics, such as Time to First Token (TTFT), Tokens Per Second (TPS), input prompt length, and total generated tokens, providing a clear view of system responsiveness and throughput. - -To enable profiling, run the sample with the ``--profiling`` flag: - -```sh -python rag.py --profiling -``` -**Note:** -Actual numbers may vary depending on the LLM used, model version, and specific system configuration. 
- -**Sample output:** -``` ---- Aggregated Profiling Summary --- - -Q1: - Avg Input Tokens : 1607 - Avg Output Tokens : 339 - Avg TTFT(Sec) : 1.640761 - Avg TPS : 31.16 - -Q2: - Avg Input Tokens : 1171 - Avg Output Tokens : 354 - Avg TTFT(Sec) : 1.16953 - Avg TPS : 32.74 - -Q3: - Avg Input Tokens : 1458 - Avg Output Tokens : 1 - Avg TTFT(Sec) : 1.393054 - Avg TPS : 0.0 -``` - - - - - - diff --git a/LLM-examples/README.md b/LLM-examples/README.md deleted file mode 100644 index ed616e9e..00000000 --- a/LLM-examples/README.md +++ /dev/null @@ -1,23 +0,0 @@ - - - - -

Ryzen™ AI LLM Examples

-
- -# Ryzen AI LLM OGA (Onnx Runtime Generate API) Flow - -Ryzen AI Software supports deploying LLMs on Ryzen AI PCs using the native ONNX Runtime Generate (OGA) C++ or Python API. - -Refer to [OnnxRuntime GenAI (OGA)](oga_api/README.md) or https://ryzenai.docs.amd.com/en/latest/hybrid_oga.html for more details. - -## LLM flow examples - -- [LLMs on RyzenAI with ONNX Runtime GenAI API](oga_api) -- [ONNX Runtime GenAI(OGA)-based RAG LLM](RAG-OGA) -- [Running Vision Language Model (VLM) on RyzenAI NPU](VLM) -- [Running GPT-OSS-20B with chat template](oga_inference) - -# Copyright - -Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved. diff --git a/README.md b/README.md index 3c25a3e5..5c155a0b 100644 --- a/README.md +++ b/README.md @@ -1,79 +1,99 @@ - - - - -

Ryzen™ AI Software

-
+# Ryzen AI Software -## Introduction +AMD Ryzen AI Software includes the tools and runtime libraries for optimizing and deploying AI inference on [AMD Ryzen AI](https://www.amd.com/en/products/processors/consumer/ryzen-ai.html) PCs. It enables developers to build and run AI applications on the neural processing unit (NPU), integrated GPU, and discrete GPU. -AMD Ryzen™ AI Software includes the tools and runtime libraries for optimizing and deploying AI inference on your [AMD Ryzen™ AI](https://www.amd.com/en/products/processors/consumer/ryzen-ai.html) based PC. It enables developers to quickly build and run a variety of AI applications for Ryzen™ AI. It is designed with high efficiency and ease-of-use in mind, unleashing the full potential of AI acceleration on Ryzen™ AI. +This repository contains documentation, examples, and tutorials demonstrating the usage and capabilities of Ryzen AI Software. -This repository contains the demos, examples and tutorials, demonstrating usage and capabilities of the Ryzen™ AI Software. It is a subset of the Ryzen™ AI Software release. +## Documentation -Follow the instructions at [Ryzen™ AI Software](https://ryzenai.docs.amd.com/en/latest/inst.html) for installation. +The full documentation site is built from the `docs/` directory using [Docusaurus](https://docusaurus.io/). -## Git LFS and Instructions to clone: +**Live docs:** [ryzenai.docs.amd.com](https://ryzenai.docs.amd.com) - Due to the presence of large files in some examples/tutorials, Git Large File Storage (LFS) has been configured in this repository. 
Follow the instructions below to ensure Git LFS is properly set up: - - Install Git LFS by downloading it from the [official website](https://git-lfs.com/) - - After installation, run the following command in your terminal to set up Git LFS on your machine: -``` - git lfs install -``` - - Clone the repository (or a fork of it): +To run the docs site locally: + +```bash +cd docs/website +npm install +npx docusaurus start ``` -git clone https://github.com/amd/RyzenAI-SW.git + +## Repository Structure + ``` -- Pull the actual LFS files: -``` -git lfs pull +RyzenAI-SW/ +├── docs/ # Documentation and examples +│ ├── getting-started/ # Installation, quickstart, hardware support +│ ├── applications/ # Showcased AI PC applications +│ ├── models-tutorials/ # Models, tutorials, and example code +│ │ ├── llms/ # LLM tutorials and examples +│ │ ├── vision/ # CNN, object detection, image classification +│ │ ├── audio/ # Whisper ASR examples +│ │ ├── multimodal/ # VLM and multi-model pipelines +│ │ └── nlp/ # DistilBERT and NLP examples +│ ├── develop/ # Developer guides (deployment, quantization) +│ ├── tools/ # AI Analyzer, NPU management, benchmarking +│ ├── reference/ # Changelog, model list, supported operators +│ └── website/ # Docusaurus build infrastructure +└── .github/workflows/ # CI/CD pipelines ``` -To run the demos and examples in this repository, please follow the instructions of README.md in each directory. 
+## Getting Started +- [Installation](docs/getting-started/installation.mdx) +- [Quickstart](docs/getting-started/quickstart.mdx) +- [Supported Hardware](docs/getting-started/supported-hardware.mdx) -## Getting Started Tutorials +## LLM Tutorials -- [Getting started tutorial with a fine-tuned ResNet model](CNN-examples/getting_started_resnet) -- [Hello world jupyter notebook tutorial](CNN-examples/hello_world) -- [Getting started ResNet50 example on iGPU](CNN-examples/iGPU/getting_started) +- [LLMs on Ryzen AI with OGA API](docs/models-tutorials/llms/oga_api) +- [RAG with OGA](docs/models-tutorials/llms/RAG-OGA) +- [Vision Language Model (VLM)](docs/models-tutorials/llms/VLM) +- [OGA Inference](docs/models-tutorials/llms/oga_inference) +- [LLM Fine-tuning and Deployment](docs/models-tutorials/llms/llm-sft-deploy) -## LLM Flow +## Vision Examples -- [LLMs on RyzenAI with ONNX Runtime GenAI API](LLM-examples/oga_api) -- [ONNX Runtime GenAI(OGA)‑based RAG LLM](LLM-examples/RAG-OGA) -- [Running Vision Language Model (VLM) on RyzenAI NPU](LLM-examples/VLM) -- [Running GPT-OSS-20B with chat template](LLM-examples/oga_inference) +- [Getting Started with ResNet](docs/models-tutorials/vision/getting_started_resnet) +- [Hello World Notebook](docs/models-tutorials/vision/hello_world) +- [iGPU Getting Started](docs/models-tutorials/vision/iGPU/getting_started) +- [Image Classification](docs/models-tutorials/vision/image_classification) +- [Object Detection (YOLOv8)](docs/models-tutorials/vision/object_detection) +- [Super-Resolution](docs/models-tutorials/vision/super-resolution) +- [Torchvision Inference](docs/models-tutorials/vision/torchvision_inference) +- [AMD Quark Quantization](docs/models-tutorials/vision/quark_quantization) +- [CVML Library](docs/models-tutorials/vision/cvml) -## Examples +## Audio Examples -- BF16 Model Examples - - [Finetuned DistilBERT for Text Classification](Transformer-examples/DistilBERT_text_classification_bf16) - - [Image 
classification](CNN-examples/image_classification) -- [Object detection with Yolov8 models](CNN-examples/object_detection) -- [Automatic Speech Recognition: Step by Step guide to run Whisper-base on NPU](Transformer-examples/ASR/Whisper-AI) +- [Whisper ASR](docs/models-tutorials/audio/whisper) +## Multimodal Examples -## Demos +- [NPU-GPU Pipeline](docs/models-tutorials/multimodal/npu-gpu-pipeline) -- [NPU-GPU pipeline on RyzenAI](Demos/NPU-GPU-Pipeline) -- [Automatic Speech Recognition using OpenAI Whisper](Demos/ASR/Whisper) +## NLP Examples -## Other Tutorials +- [DistilBERT Text Classification](docs/models-tutorials/nlp/distilbert) -- [AMD Quark Quantization](CNN-examples/quark_quantization) -- [Run Ryzen AI CVML library application](Ryzen-AI-CVML-Library) -- [Torchvision models End-to-End inference with Ryzen AI](CNN-examples/torchvision_inference) +## Tools +- [ONNX Benchmark Utilities](docs/tools/benchmarking) +- [NPU Check](docs/tools/npu-check) -## Benchmarking +## Git LFS -- [ONNX benchmark utilities](onnx-benchmark) +Some examples contain large files managed by Git LFS. 
After cloning: +```bash +git lfs install +git lfs pull +``` ## Reference -- [Ryzen™ AI Developer Guide](https://ryzenai.docs.amd.com/en/latest) -- [ONNX Runtime Vitis-AI EP](https://onnxruntime.ai/docs/execution-providers/Vitis-AI-ExecutionProvider.html) +- [AMD AI Developer Program](https://www.amd.com/en/developer/ai-dev-program.html) +- [AMD Developer Community Discord](https://discord.gg/amd-dev) - [AMD AI Developer Forum](https://community.amd.com/t5/ai/ct-p/amd_ai) +- [Ryzen AI Developer Guide](https://ryzenai.docs.amd.com) +- [ONNX Runtime Vitis-AI EP](https://onnxruntime.ai/docs/execution-providers/Vitis-AI-ExecutionProvider.html) diff --git a/Ryzen-AI-CVML-Library/linux/onnx/cpu/libonnxruntime.so b/Ryzen-AI-CVML-Library/linux/onnx/cpu/libonnxruntime.so deleted file mode 120000 index a64790aa..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/cpu/libonnxruntime.so +++ /dev/null @@ -1 +0,0 @@ -libonnxruntime.so.1 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/cpu/libonnxruntime.so.1 b/Ryzen-AI-CVML-Library/linux/onnx/cpu/libonnxruntime.so.1 deleted file mode 120000 index b56b9bac..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/cpu/libonnxruntime.so.1 +++ /dev/null @@ -1 +0,0 @@ -libonnxruntime.so.1.22.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libglog.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libglog.so deleted file mode 120000 index c0455457..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libglog.so +++ /dev/null @@ -1 +0,0 @@ -libglog.so.1 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libglog.so.1 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libglog.so.1 deleted file mode 120000 index f662722a..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libglog.so.1 +++ /dev/null @@ -1 +0,0 @@ -libglog.so.0.6.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime.so 
b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime.so deleted file mode 120000 index a64790aa..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime.so +++ /dev/null @@ -1 +0,0 @@ -libonnxruntime.so.1 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime.so.1 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime.so.1 deleted file mode 120000 index 15331b1a..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime.so.1 +++ /dev/null @@ -1 +0,0 @@ -libonnxruntime.so.1.20.1 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime_vitisai_ep.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime_vitisai_ep.so deleted file mode 120000 index 1c746796..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime_vitisai_ep.so +++ /dev/null @@ -1 +0,0 @@ -libonnxruntime_vitisai_ep.so.1 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime_vitisai_ep.so.1 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime_vitisai_ep.so.1 deleted file mode 120000 index 511b90ad..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libonnxruntime_vitisai_ep.so.1 +++ /dev/null @@ -1 +0,0 @@ -libonnxruntime_vitisai_ep.so.1.0.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libtarget-factory.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libtarget-factory.so deleted file mode 120000 index 9f5e671b..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libtarget-factory.so +++ /dev/null @@ -1 +0,0 @@ -libtarget-factory.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libtarget-factory.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libtarget-factory.so.3 deleted file mode 120000 index 46871628..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libtarget-factory.so.3 +++ /dev/null @@ -1 +0,0 @@ -libtarget-factory.so.3.5.0 \ No 
newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libunilog.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libunilog.so deleted file mode 120000 index 8b837155..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libunilog.so +++ /dev/null @@ -1 +0,0 @@ -libunilog.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libunilog.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libunilog.so.3 deleted file mode 120000 index 0b4d95b5..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libunilog.so.3 +++ /dev/null @@ -1 +0,0 @@ -libunilog.so.3.5.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-cpu-runner.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-cpu-runner.so deleted file mode 120000 index 7ea3baa8..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-cpu-runner.so +++ /dev/null @@ -1 +0,0 @@ -libvart-cpu-runner.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-cpu-runner.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-cpu-runner.so.3 deleted file mode 120000 index 891e4402..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-cpu-runner.so.3 +++ /dev/null @@ -1 +0,0 @@ -libvart-cpu-runner.so.3.5.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-mem-manager.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-mem-manager.so deleted file mode 120000 index fe716c6d..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-mem-manager.so +++ /dev/null @@ -1 +0,0 @@ -libvart-mem-manager.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-mem-manager.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-mem-manager.so.3 deleted file mode 120000 index a8b4af52..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-mem-manager.so.3 +++ /dev/null @@ -1 +0,0 @@ 
-libvart-mem-manager.so.3.5.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-runner.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-runner.so deleted file mode 120000 index f6dcdb9a..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-runner.so +++ /dev/null @@ -1 +0,0 @@ -libvart-runner.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-runner.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-runner.so.3 deleted file mode 120000 index 8a99f742..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-runner.so.3 +++ /dev/null @@ -1 +0,0 @@ -libvart-runner.so.3.5.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-trace.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-trace.so deleted file mode 120000 index 6480799c..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-trace.so +++ /dev/null @@ -1 +0,0 @@ -libvart-trace.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-trace.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-trace.so.3 deleted file mode 120000 index 8fee3ae3..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-trace.so.3 +++ /dev/null @@ -1 +0,0 @@ -libvart-trace.so.3.5.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-util.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-util.so deleted file mode 120000 index 9ecbafc3..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-util.so +++ /dev/null @@ -1 +0,0 @@ -libvart-util.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-util.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-util.so.3 deleted file mode 120000 index 14af522d..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libvart-util.so.3 +++ /dev/null @@ -1 +0,0 @@ -libvart-util.so.3.5.0 \ No newline 
at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxcompiler-xcompiler-core.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxcompiler-xcompiler-core.so deleted file mode 120000 index d06cb889..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxcompiler-xcompiler-core.so +++ /dev/null @@ -1 +0,0 @@ -libxcompiler-xcompiler-core.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxcompiler-xcompiler-core.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxcompiler-xcompiler-core.so.3 deleted file mode 120000 index 2c77fd12..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxcompiler-xcompiler-core.so.3 +++ /dev/null @@ -1 +0,0 @@ -libxcompiler-xcompiler-core.so.3.5.0 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxir.so b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxir.so deleted file mode 120000 index 9752e81e..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxir.so +++ /dev/null @@ -1 +0,0 @@ -libxir.so.3 \ No newline at end of file diff --git a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxir.so.3 b/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxir.so.3 deleted file mode 120000 index 537fb538..00000000 --- a/Ryzen-AI-CVML-Library/linux/onnx/ryzen14/libxir.so.3 +++ /dev/null @@ -1 +0,0 @@ -libxir.so.3.5.0 \ No newline at end of file diff --git a/Transformer-examples/ASR/Whisper-AI/README.md b/Transformer-examples/ASR/Whisper-AI/README.md deleted file mode 100644 index a104aafb..00000000 --- a/Transformer-examples/ASR/Whisper-AI/README.md +++ /dev/null @@ -1,148 +0,0 @@ - - - - -

Ryzen™ AI ASR

-
- -# Running Whisper on Ryzen AI - -This Ryzen AI example lets you bring in OpenAI’s Whisper model and run fast, local automatic speech recognition (ASR) on your AMD NPU. Whisper is a versatile speech model trained on 680,000+ hours of diverse audio, capable of speech-to-text, translation, and language detection. -This example uses the [Whisper-base](https://huggingface.co/openai/whisper-base) variant and provides a simple demonstration of how to run it on the NPU. For real-time factor (RTF) evaluation of the model on the NPU, please refer to the [whisper-demo](https://github.com/amd/RyzenAI-SW/tree/main/demo/ASR/Whisper). - -Learn how you can: -- **Export Whisper models** from Hugging Face to ONNX format -- **Optimize** them for static shape inference -- **Run ASR** fully on-device using CPU or AMD NPU -- **Evaluate ASR** performance on sample data from public datasets like LibriSpeech. - -This example supports: -- **Audio file transcription** – load your own `.wav` files for instant speech-to-text - -## Prerequisites -**Step 1:** Install the latest Conda environment using [RyzenAI Documentation](https://ryzenai.docs.amd.com/en/latest/inst.html#). -Ensure the SDK and driver are installed. - -**Step 2:** Export Hugging face Whisper model to onnx and set static shape as mentioned below: -1. Activate conda environment: -```bash - conda create --name asr --clone ryzen-ai- - conda activate asr -``` -2. Navigate to the Whisper-AI directory: -```bash - cd \Transformer-examples\ASR\Whisper-AI -``` -3. Install the necessary libraries: -```bash - pip install -r requirements.txt - ``` -4. Export Whisper AI model to ONNX using [Hugging Face Optimum library](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model): -```bash - optimum-cli export onnx --model openai/whisper-base.en --opset 17 exported_model_directory - ``` -**Note:** -The above command creates a new directory `exported_model_directory` in the current path. 
In `exported_model_directory`, you should see `encoder_model.onnx` and `decoder_model.onnx` models available. - -5. Convert the dynamic ONNX model to static using the `dynamic_to_static.py` script. -```bash - #Convert the encoder - python dynamic_to_static.py --input_model ".\exported_model_directory\encoder_model.onnx" - - #Convert the decoder - python dynamic_to_static.py --input_model ".\exported_model_directory\decoder_model.onnx" - ``` -The `dynamic_to_static.py` script utilizes `onnxruntime.tools.make_dynamic_shape_fixed` to convert dynamic shapes in an ONNX model to static shapes. It takes as input a `params.json` file, which specifies the dynamic dimensions to be fixed and their target static values. After the conversion, the script verifies the correctness of the modified ONNX model using the ONNX Checker and performs a dummy inference to ensure the model runs as expected. - -The `params.json` file defines the static shapes used to convert a dynamic Whisper-base ONNX model to a fixed-shape version suitable for optimized inference on NPUs. - -```bash - { - "batch_size": "1", - "encoder_sequence_length / 2": "1500", - "decoder_sequence_length": "180" -} -``` -- `"batch_size": "1"` - Fixes the model to process one audio sample at a time. -- `"encoder_sequence_length / 2": "1500"` - Whisper converts audio to a log-Mel spectrogram with 3000 frames for 30s of audio. After 2× downsampling, the encoder input length becomes 1500. This is fixed in params.json for optimized static-shape inference. -- `"decoder_sequence_length": "180"` - Fixed to 180 to match 30s of audio input (3000 tokens). At ~5 tokens/sec, average output is 150 tokens; 30-token buffer ensures completeness and handles variation - -**Note:** The final static ONNX models are stored in `.\exported_model_directory\encoder_model.onnx` and `.\exported_model_directory\decoder_model.onnx`. 
- -## Whisper ONNX Inference and Evaluation - -The `run_whisper.py` script performs speech-to-text transcription using a Whisper-base model exported to ONNX format. It supports transcribing audio from WAV files or a live microphone stream and can evaluate model accuracy on a labeled dataset using WER and CER metrics. The script runs the encoder and decoder models via ONNX Runtime, with support for both CPU and NPU backends, and includes chunk-based processing for long audio inputs. - -The `load_provider_options` function returns ONNX Runtime execution providers and configuration options based on the selected device (cpu or npu). - -```bash - provider = "VitisAIExecutionProvider" - - encoder_options = { - "config_file": "vaiep_config.json", - "cache_dir": "./cache/", - "cache_key": "whisper_encoder" - } - - decoder_options = { - "config_file": "vaiep_config.json", - "cache_dir": "./cache/", - "cache_key": "whisper_decoder" - } -``` -When running on the NPU, the provider options for the encoder and decoder are identical, except for the cache directory used. Both utilize the official RAI `vaiep_config.json` [configuration file](https://ryzenai.docs.amd.com/en/latest/modelrun.html#config-file-options). - -When running inference on the NPU, 100% of the encoder operators and 93.4% of the decoder operators are executed on the NPU. - -```bash - #encoder operations - [Vitis AI EP] No. of Operators : VAIML 225 - [Vitis AI EP] No. of Subgraphs : VAIML 1 - - #decoder operations - [Vitis AI EP] No. of Operators : CPU 24 VAIML 341 - [Vitis AI EP] No. 
of Subgraphs : VAIML 2 -``` - -Command to run transcription using `.wav` file or microphone: -```bash - python run_whisper.py \ - --encoder exported_model_directory\encoder_model.onnx \ - --decoder exported_model_directory\decoder_model.onnx \ - --device \ - --input -``` - -### Expected Output - -Run the above command with sample audio file and observe the expected Model output below - ---input audio_files\61-52s.wav - -```bash -Transcription: Also, there was a stripling page who turned into a maze with so sweet a lady, sir. -And in some manner, I do think she died. But then the picture was gone as quickly as it came. -Sister Nell, do you hear these models? Take your place and let us see what the crystal can show to you, like is not young, Master. -Though I am an old man. With all rant the opening of the tent to see what might be a miss. -But Master Will, who peeped out first, needed no more than one glance. -Mistress Fitzsooth to the rear of the Ted cries of "A knotting ham! A knotting ham!" before them fled the stroller and his three sons, capless and tear away. -"What is that tumult and rioting?" cried out the squire, authoritatively, and he blew twice on the silver whistle which hung at his belt. - -``` - -### Model Evaluation - -To evaluate model performance, we provide an eval_dataset directory containing sample audio from the LibriSpeech dataset. You can run the following command to generate a detailed report including WER, and CER metrics: -```bash -python run_whisper.py \ - --encoder exported_model_directory\encoder_model.onnx \ - --decoder exported_model_directory\decoder_model.onnx \ - --device \ - --eval-dir eval_dataset\LibriSpeech-samples \ - --results-dir -``` - -### Notes - -- If the model has not been precompiled before, the first run will take approximately 15 minutes to compile. -- Ensure that the paths to the encoder, decoder, and configuration file are correctly set based on your environment. 
diff --git a/Transformer-examples/ASR/Whisper-AI/audio_files/1089-134686-0000.wav b/Transformer-examples/ASR/Whisper-AI/audio_files/1089-134686-0000.wav deleted file mode 100644 index 2cdc6df3..00000000 Binary files a/Transformer-examples/ASR/Whisper-AI/audio_files/1089-134686-0000.wav and /dev/null differ diff --git a/Transformer-examples/ASR/Whisper-AI/audio_files/61-52s.wav b/Transformer-examples/ASR/Whisper-AI/audio_files/61-52s.wav deleted file mode 100644 index b94fb03b..00000000 Binary files a/Transformer-examples/ASR/Whisper-AI/audio_files/61-52s.wav and /dev/null differ diff --git a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/transcripts.txt b/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/transcripts.txt deleted file mode 100644 index a9b64e71..00000000 --- a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/transcripts.txt +++ /dev/null @@ -1,3 +0,0 @@ -61-70968-0000 HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT -1089-134686-0009 AT MOST BY AN ALMS GIVEN TO A BEGGAR WHOSE BLESSING HE FLED FROM HE MIGHT HOPE WEARILY TO WIN FOR HIMSELF SOME MEASURE OF ACTUAL GRACE -3570-5694-0000 BUT ALREADY AT A POINT IN ECONOMIC EVOLUTION FAR ANTEDATING THE EMERGENCE OF THE LADY SPECIALISED CONSUMPTION OF GOODS AS AN EVIDENCE OF PECUNIARY STRENGTH HAD BEGUN TO WORK OUT IN A MORE OR LESS ELABORATE SYSTEM \ No newline at end of file diff --git a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/1089-134686-0009.wav b/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/1089-134686-0009.wav deleted file mode 100644 index 9c711054..00000000 Binary files a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/1089-134686-0009.wav and /dev/null differ diff --git a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/3570-5694-0000.wav 
b/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/3570-5694-0000.wav deleted file mode 100644 index 31643ce5..00000000 Binary files a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/3570-5694-0000.wav and /dev/null differ diff --git a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/61-70968-0000.wav b/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/61-70968-0000.wav deleted file mode 100644 index ab3cab4f..00000000 Binary files a/Transformer-examples/ASR/Whisper-AI/eval_dataset/LibriSpeech-samples/wav/61-70968-0000.wav and /dev/null differ diff --git a/Transformer-examples/ASR/Whisper-AI/requirements.txt b/Transformer-examples/ASR/Whisper-AI/requirements.txt deleted file mode 100644 index 35db0d29..00000000 --- a/Transformer-examples/ASR/Whisper-AI/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -torch==2.8.0 -torchaudio==2.8.0 -sounddevice==0.5.3 -transformers==4.52.4 -onnxsim==0.4.36 -optimum==1.27.0 -accelerate==1.11.0 -jiwer==4.0.0 -PySoundFile==0.9.0 \ No newline at end of file diff --git a/Transformer-examples/ASR/Whisper-AI/run_whisper.py b/Transformer-examples/ASR/Whisper-AI/run_whisper.py deleted file mode 100644 index 9e143eea..00000000 --- a/Transformer-examples/ASR/Whisper-AI/run_whisper.py +++ /dev/null @@ -1,281 +0,0 @@ -import argparse -import json -import numpy as np -import onnxruntime as ort -import torchaudio -import sounddevice as sd -import queue -import threading -import time -from transformers import WhisperFeatureExtractor, WhisperTokenizer -from pathlib import Path -from jiwer import wer, cer - -SAMPLE_RATE = 16000 -CHUNK_SIZE = 1600 # 0.1 sec chunks - - -class WhisperONNX: - def __init__(self, encoder_path, decoder_path, - tokenizer_dir=None,encoder_providers=None, decoder_providers=None): - - self.encoder = ort.InferenceSession(encoder_path, providers=encoder_providers) - self.decoder = ort.InferenceSession(decoder_path, 
providers=decoder_providers) - - if tokenizer_dir is None: - tokenizer_dir = Path(encoder_path).parent - print(f"\nLoading tokenizer and feature extractor from: {Path(tokenizer_dir).resolve()}") - self.feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_dir) - self.tokenizer = WhisperTokenizer.from_pretrained(tokenizer_dir) - - self.decoder_start_token = self.sot_token = self.tokenizer.convert_tokens_to_ids("<|startoftranscript|>") - self.eos_token = self.tokenizer.eos_token_id - self.max_length = min(448, self.decoder.get_inputs()[0].shape[1]) - if not isinstance(self.max_length, int): - raise ValueError("Invalid/Dynamic input shapes") - def preprocess(self, audio): - """ - Convert raw audio to Whisper log-mel spectrogram - """ - inputs = self.feature_extractor(audio, sampling_rate=SAMPLE_RATE, return_tensors="np") - return inputs["input_features"] - - def encode(self, input_features): - """ - Run encoder ONNX model - """ - return self.encoder.run(None, {"input_features": input_features})[0] - - def decode(self, encoder_out): - """ - Greedy decode with fixed-length input_ids - """ - tokens = [self.decoder_start_token] - for _ in range(self.max_length): - # Pad input_ids to (1, max_length) - decoder_input = np.full((1, self.max_length), self.eos_token, dtype=np.int64) - decoder_input[0, :len(tokens)] = tokens - - outputs = self.decoder.run(None, { - "input_ids": decoder_input, - "encoder_hidden_states": encoder_out - }) - logits = outputs[0] - next_token = int(np.argmax(logits[0, len(tokens)-1])) - - if next_token == self.eos_token: - break - tokens.append(next_token) - return tokens - - def transcribe(self, audio, chunk_length_s=30, is_mic=False): - """ - Full encode-decode pipeline with support for long-form transcription using chunking. 
- """ - chunk_size = SAMPLE_RATE * chunk_length_s - total_samples = len(audio) - transcription = [] - chunk_idx = 0 - - overlap = SAMPLE_RATE * 1 #Tune this - for start in range(0, total_samples, chunk_size - overlap): - end = min(start + chunk_size, total_samples) - audio_chunk = audio[start:end] - - # Process the chunk - input_features = self.preprocess(audio_chunk) - encoder_out = self.encode(input_features) - tokens = self.decode(encoder_out) - transcription.append(self.tokenizer.decode(tokens, skip_special_tokens=True).strip()) - chunk_idx+= 1 - - # Combine all transcriptions - return " ".join(transcription) - -def evaluate(model, dataset_dir, results_dir): - dataset_name = Path(dataset_dir).name - wav_dir = Path(dataset_dir) / "wav" - transcript_file = Path(dataset_dir) / "transcripts.txt" - - if not transcript_file.exists() or not wav_dir.exists(): - print(f"Missing transcripts.txt or wav folder in {dataset_dir}") - return - - with open(transcript_file, "r", encoding="utf-8") as f: - references = {line.split()[0]: " ".join(line.strip().split()[1:]) for line in f.readlines()} - - output_dir = Path(results_dir) / dataset_name - output_dir.mkdir(parents=True, exist_ok=True) - result_file = output_dir / "results.txt" - - total_wer, total_cer, count = 0, 0, 0 - - with result_file.open("w", encoding="utf-8") as out_f: - for wav_path in sorted(wav_dir.glob("*.wav")): - key = wav_path.stem - if key not in references: - print(f"Reference for {key} not found in transcripts.txt") - continue - reference = references[key].lower() - # FIX: Convert Path to str for torchaudio - waveform, sr = torchaudio.load(str(wav_path)) - if sr != SAMPLE_RATE: - waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(waveform) - audio = waveform.squeeze(0).numpy() - predicted = model.transcribe(audio) - - sample_wer = wer(reference, predicted) - sample_cer = cer(reference, predicted) - total_wer += sample_wer - total_cer += sample_cer - count += 1 - - 
out_f.write(f"{key}\n") - out_f.write(f"Reference: {reference}\n") - out_f.write(f"Predicted: {predicted}\n") - out_f.write(f"WER: {sample_wer:.3f}, CER: {sample_cer:.3f} \n\n") - - if count: - avg_wer = total_wer / count - avg_cer = total_cer / count - print(f"Evaluation completed for {count} files.") - print(f"Average WER: {avg_wer:.3f}, Average CER: {avg_cer:.3f}") - out_f.write(f"Summary:\nAverage WER: {avg_wer:.3f}\nAverage CER: {avg_cer:.3f}\n") - else: - print("No valid audio-transcript pairs found.") - -def load_provider_options(device): - if device == "cpu": - provider = "CPUExecutionProvider" - return [provider], [provider] - - elif device == "npu": - provider = "VitisAIExecutionProvider" - - encoder_options = { - "config_file": "vaiep_config.json", - "cache_dir": "./cache/", - "cache_key": "whisper_encoder" - } - - decoder_options = { - "config_file": "vaiep_config.json", - "cache_dir": "./cache/", - "cache_key": "whisper_decoder" - } - - return [(provider, encoder_options)], [(provider, decoder_options)] - - -def mic_stream(model, duration=0, silence_threshold=0.01, silence_duration=5.0): - """ - Capture microphone audio and transcribe in real time. - Automatically stops on silence if duration=0. - """ - q_audio = queue.Queue() - stop_flag = threading.Event() - - def audio_callback(indata, frames, time, status): - if status: - print(status, flush=True) - q_audio.put(indata.copy()) - - def feeder(): - try: - with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype='float32', - blocksize=CHUNK_SIZE, callback=audio_callback): - if duration > 0: - sd.sleep(int(duration * 1000)) - stop_flag.set() - else: - while not stop_flag.is_set(): - sd.sleep(100) - except sd.PortAudioError as e: - print(f"\n Microphone error: {e}") - print("Could not initialize microphone. 
Please check your audio device settings.") - stop_flag.set() - - threading.Thread(target=feeder, daemon=True).start() - - buffer = np.zeros((0,), dtype=np.float32) - silence_start = None - print("\n Real-time Transcription:") - while not stop_flag.is_set(): - try: - chunk = q_audio.get(timeout=0.1).squeeze() - buffer = np.concatenate((buffer, chunk)) - - # Check for silence - rms = np.sqrt(np.mean(chunk**2)) - if rms < silence_threshold: - if silence_start is None: - silence_start = time.time() - elif time.time() - silence_start >= silence_duration: - print("\n Silence detected. Stopping transcription.") - stop_flag.set() - break - else: - silence_start = None # Reset silence timer - - if len(buffer) >= SAMPLE_RATE * 2: - text, _ = model.transcribe(buffer, is_mic=True) - print(text) - buffer = np.zeros((0,), dtype=np.float32) - except queue.Empty: - continue - - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--input", help="WAV file path or 'mic'") - parser.add_argument("--encoder", required=True, help="Path to Whisper encoder ONNX model") - parser.add_argument("--decoder", required=True, help="Path to Whisper decoder ONNX model") - parser.add_argument("--tokenizer-dir", default=None, - help="Path to directory containing tokenizer and feature extractor files. 
" - "If not set, defaults to directory of encoder/decoder models.") - parser.add_argument("--eval-dir", help="Dataset directory with wavs/ and transcripts.txt") - parser.add_argument("--results-dir", default="results", help="Directory to store evaluation results") - parser.add_argument("--device", choices=['cpu', 'npu'], default='cpu') - parser.add_argument("--duration", type=int, default=0, help="Mic duration in seconds (0 = unlimited)") - args = parser.parse_args() - - - encoder_providers, decoder_providers = load_provider_options(args.device - ) - - model = WhisperONNX(args.encoder, - args.decoder, - tokenizer_dir=args.tokenizer_dir, - encoder_providers=encoder_providers, - decoder_providers=decoder_providers) - - if args.eval_dir: - evaluate(model, args.eval_dir, args.results_dir) - return - - if not args.input and not args.eval_dir: - print("Error: You must provide --input (wav or mic) or --eval-dir.") - return - - if args.input and args.input.lower() not in ['mic'] and not Path(args.input).suffix == '.wav': - print("Error: --input must be 'mic' or path to a .wav file.") - return - - if args.input.lower() == 'mic': - try: - mic_stream(model, args.duration) - except sd.PortAudioError as e: - print("Fix your device or try using a .wav file instead of mic. 
Exiting") - return - else: - waveform, sr = torchaudio.load(args.input) - if sr != SAMPLE_RATE: - waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(waveform) - audio = waveform.squeeze(0).numpy() - text = model.transcribe(audio, chunk_length_s=30) - print("\n Transcription:", text) - - -if __name__ == "__main__": - main() diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..b12f0cb4 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,290 @@ + +# Ryzen AI Documentation Map + +This page is the single source of truth for how the old Sphinx/RST documentation at [ryzenai.docs.amd.com](https://ryzenai.docs.amd.com/en/latest/) and the old GitHub examples at [github.com/amd/RyzenAI-SW](https://github.com/amd/RyzenAI-SW) map to this new Docusaurus/MDX site. + +## Repository Structure + +``` +RyzenAI-SW/ + docs/ # Pure MDX documentation (source of truth for the website) + examples/ # Runnable code examples with plain README.md + website/ # Docusaurus infrastructure (config, theme, scripts) + scripts/ # Build-time scripts (sync-examples.mjs) + src/ # Theme customizations, components, CSS + i18n/ # Translation files (11 locales) + .github/ # CI/CD workflows +``` + +## How Tutorial Pages Work + +Tutorial pages in `docs/` that correspond to runnable code in `examples/` are auto-generated: + +```bash +node website/scripts/sync-examples.mjs # Generate docs from examples +node website/scripts/sync-examples.mjs --check # Verify sync (used in CI) +``` + +The `examples/*/README.md` files are the source of truth. Edit those, then re-run sync. Do **not** directly edit auto-generated `.mdx` files (they have a comment at the top). + +--- + +## Old RST to New MDX: Complete Migration Map + +The old documentation lived in [github.com/amd/ryzen-ai-documentation](https://github.com/amd/ryzen-ai-documentation) (24 RST files) and was published to [ryzenai.docs.amd.com/en/latest/](https://ryzenai.docs.amd.com/en/latest/). 
Below is every RST file and what happened to it. + +### "Getting Started on the NPU" (old section) + +| Old RST File | Old Published URL | New MDX File | Status | +|---|---|---|---| +| `index.rst` | [ryzenai.docs.amd.com/en/latest/](https://ryzenai.docs.amd.com/en/latest/) | `getting-started/overview.mdx` | **Replaced** with "Overview and Architecture" | +| `inst.rst` | [ryzenai.docs.amd.com/en/latest/inst.html](https://ryzenai.docs.amd.com/en/latest/inst.html) | `getting-started/installation.mdx` | **Migrated** (manual install merged in) | +| `examples.rst` | [ryzenai.docs.amd.com/en/latest/examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) | `models-tutorials/index.mdx` | **Merged** into Models and Tutorials landing | +| `relnotes.rst` | [ryzenai.docs.amd.com/en/latest/relnotes.html](https://ryzenai.docs.amd.com/en/latest/relnotes.html) | `reference/changelog/index.mdx` + `getting-started/supported-hardware.mdx` | **Split** into Changelog and Supported Hardware | + +### "Running Models on the NPU" (old section) + +| Old RST File | Old Published URL | New MDX File | Status | +|---|---|---|---| +| `model_quantization.rst` | [ryzenai.docs.amd.com/en/latest/model_quantization.html](https://ryzenai.docs.amd.com/en/latest/model_quantization.html) | `develop/model-quantization.mdx` | **Migrated** | +| `modelrun.rst` | [ryzenai.docs.amd.com/en/latest/modelrun.html](https://ryzenai.docs.amd.com/en/latest/modelrun.html) | `develop/model-deployment.mdx` | **Migrated** | +| `app_development.rst` | [ryzenai.docs.amd.com/en/latest/app_development.html](https://ryzenai.docs.amd.com/en/latest/app_development.html) | `develop/app-development.mdx` | **Migrated** | +| `whisper_cpp.rst` | [ryzenai.docs.amd.com/en/latest/whisper_cpp.html](https://ryzenai.docs.amd.com/en/latest/whisper_cpp.html) | `models-tutorials/audio/whisper/index.mdx` | **Migrated** under Audio | +| `getstartex.rst` | 
[ryzenai.docs.amd.com/en/latest/getstartex.html](https://ryzenai.docs.amd.com/en/latest/getstartex.html) | `models-tutorials/vision/cnn-examples.mdx` | **Migrated** as "ResNet INT8 Tutorial" | + +### "Running LLMs on the NPU" (old section) + +| Old RST File | Old Published URL | New MDX File | Status | +|---|---|---|---| +| `llm/overview.rst` | [ryzenai.docs.amd.com/en/latest/llm/overview.html](https://ryzenai.docs.amd.com/en/latest/llm/overview.html) | `models-tutorials/llms/overview.mdx` | **Migrated** (now LLM landing page) | +| `llm/server_interface.rst` | [ryzenai.docs.amd.com/en/latest/llm/server_interface.html](https://ryzenai.docs.amd.com/en/latest/llm/server_interface.html) | `models-tutorials/llms/server-interface.mdx` | **Migrated** | +| `llm/high_level_python.rst` | [ryzenai.docs.amd.com/en/latest/llm/high_level_python.html](https://ryzenai.docs.amd.com/en/latest/llm/high_level_python.html) | `models-tutorials/llms/python-api.mdx` | **Migrated** | +| `hybrid_oga.rst` | [ryzenai.docs.amd.com/en/latest/hybrid_oga.html](https://ryzenai.docs.amd.com/en/latest/hybrid_oga.html) | `models-tutorials/llms/hybrid-inference.mdx` | **Migrated** | +| `oga_model_prepare.rst` | [ryzenai.docs.amd.com/en/latest/oga_model_prepare.html](https://ryzenai.docs.amd.com/en/latest/oga_model_prepare.html) | `develop/onnx-model-preparation.mdx` | **Migrated** to Develop | +| `oga_op_prepare.rst` | [ryzenai.docs.amd.com/en/latest/oga_op_prepare.html](https://ryzenai.docs.amd.com/en/latest/oga_op_prepare.html) | `develop/operator-preparation.mdx` | **Migrated** to Develop | +| `llm_linux.rst` | (not in public sidebar) | `models-tutorials/llms/linux-setup.mdx` | **Migrated** | +| `hybrid_oga_pip_install_draft.rst` | (unpublished draft) | N/A | **Deleted** | + +### "Running Models on the GPU" (old section) + +| Old RST File | Old Published URL | New MDX File | Status | +|---|---|---|---| +| `gpu/ryzenai_gpu.rst` | 
[ryzenai.docs.amd.com/en/latest/gpu/ryzenai_gpu.html](https://ryzenai.docs.amd.com/en/latest/gpu/ryzenai_gpu.html) | `develop/rocm-client-gpu.mdx` | **Migrated** as "DirectML Flow (GPU)" | + +### "Additional Topics" (old section) + +| Old RST File | Old Published URL | New MDX File | Status | +|---|---|---|---| +| `xrt_smi.rst` | [ryzenai.docs.amd.com/en/latest/xrt_smi.html](https://ryzenai.docs.amd.com/en/latest/xrt_smi.html) | `tools/npu-management.mdx` | **Migrated** | +| `ai_analyzer.rst` | [ryzenai.docs.amd.com/en/latest/ai_analyzer.html](https://ryzenai.docs.amd.com/en/latest/ai_analyzer.html) | `tools/ai-analyzer.mdx` | **Migrated** | +| `sd_demo.rst` | [ryzenai.docs.amd.com/en/latest/sd_demo.html](https://ryzenai.docs.amd.com/en/latest/sd_demo.html) | `models-tutorials/vision/stable-diffusion.mdx` | **Migrated** under Vision | +| `ryzen_ai_libraries.rst` | [ryzenai.docs.amd.com/en/latest/ryzen_ai_libraries.html](https://ryzenai.docs.amd.com/en/latest/ryzen_ai_libraries.html) | `develop/cvml-library.mdx` | **Migrated** to Develop | +| `ops_support.rst` | [ryzenai.docs.amd.com/en/latest/ops_support.html](https://ryzenai.docs.amd.com/en/latest/ops_support.html) | `reference/supported-operators.mdx` | **Migrated** | +| `licenses.rst` | [ryzenai.docs.amd.com/en/latest/licenses.html](https://ryzenai.docs.amd.com/en/latest/licenses.html) | `reference/licenses.mdx` | **Migrated** | +| `model_list.rst` | [ryzenai.docs.amd.com/en/latest/model_list.html](https://ryzenai.docs.amd.com/en/latest/model_list.html) | `reference/model-list.mdx` | **Migrated** | + +--- + +## Tutorial and Example Pages: Full Lineage + +These pages originate from example code that lived in the **old GitHub repo** ([github.com/amd/RyzenAI-SW](https://github.com/amd/RyzenAI-SW)). Some were also linked from pages on the **old docs site** ([ryzenai.docs.amd.com](https://ryzenai.docs.amd.com/en/latest/)). They have been reorganized into `examples/` with auto-generated `.mdx` docs pages. 
+ +### Vision Tutorials + +| Title | Old GitHub Location | Old Docs Site Reference | New `examples/` Location | New Docs MDX | +|---|---|---|---|---| +| ResNet INT8 Tutorial | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [getstartex.html](https://ryzenai.docs.amd.com/en/latest/getstartex.html) (Getting Started Tutorial) | `examples/vision/getting_started_resnet/` | `vision/cnn-examples.mdx` (hand-written) | +| ResNet Getting Started | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [getstartex.html](https://ryzenai.docs.amd.com/en/latest/getstartex.html) | `examples/vision/getting_started_resnet/` | `vision/getting-started-resnet/index.mdx` (auto-gen) | +| ResNet INT8 Quantization | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [getstartex.html](https://ryzenai.docs.amd.com/en/latest/getstartex.html) | `examples/vision/getting_started_resnet/int8/` | `vision/getting-started-resnet/int8/index.mdx` (auto-gen) | +| ResNet BF16 Tutorial | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (BF16 Model Examples) | `examples/vision/getting_started_resnet/bf16/` | `vision/getting-started-resnet/bf16/index.mdx` (auto-gen) | +| ResNet BF16 C++ Deployment | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) | `examples/vision/getting_started_resnet/bf16/docs/` | `vision/getting-started-resnet/bf16/cpp-deployment.mdx` (auto-gen) | +| Hello World Tutorial | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Hello world jupyter notebook) | `examples/vision/hello_world/` | `vision/hello-world/index.mdx` (auto-gen) | +| iGPU Getting Started | 
[CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Getting started on iGPU) | `examples/vision/iGPU/getting_started/` | `vision/igpu-getting-started/index.mdx` (auto-gen) | +| Image Classification | [Transformer-examples/](https://github.com/amd/RyzenAI-SW/tree/main/Transformer-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (BF16 Image classification) | `examples/vision/image_classification/` | `vision/image-classification/index.mdx` (auto-gen) | +| YOLOv8m Object Detection | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Object detection with Yolov8) | `examples/vision/object_detection/yolov8m/` | `vision/object-detection/yolov8m/index.mdx` (auto-gen) | +| YOLOv8s-WorldV2 | [CNN-examples/](https://github.com/amd/RyzenAI-SW/tree/main/CNN-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Object detection with Yolov8) | `examples/vision/object_detection/yolov8s-worldv2/` | `vision/object-detection/yolov8s-worldv2/index.mdx` (auto-gen) | +| Super Resolution | Not in old repo | New | `examples/vision/super-resolution/` | `vision/super-resolution/index.mdx` (auto-gen) | +| Quark Quantization | Not in old repo | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (AMD Quark Quantization) | `examples/vision/quark_quantization/` | `vision/quark-quantization/index.mdx` (auto-gen) | +| Advanced Quantization | Not in old repo | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) | `examples/vision/quark_quantization/docs/` | `vision/quark-quantization/advanced.mdx` (auto-gen) | +| CVML Library Tutorial | [Ryzen-AI-CVML-Library/](https://github.com/amd/RyzenAI-SW/tree/main/Ryzen-AI-CVML-Library) | 
[ryzen_ai_libraries.html](https://ryzenai.docs.amd.com/en/latest/ryzen_ai_libraries.html) + [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) | `examples/vision/cvml/` | `vision/cvml/index.mdx` (auto-gen) | +| Torchvision Inference | Not in old repo | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Torchvision models) | `examples/vision/torchvision_inference/` | `vision/torchvision-inference/index.mdx` (auto-gen) | +| Stable Diffusion Demo | [Demos/](https://github.com/amd/RyzenAI-SW/tree/main/Demos) | [sd_demo.html](https://ryzenai.docs.amd.com/en/latest/sd_demo.html) | N/A (hand-written page) | `vision/stable-diffusion.mdx` (hand-written) | + +### LLM Tutorials + +| Title | Old GitHub Location | Old Docs Site Reference | New `examples/` Location | New Docs MDX | +|---|---|---|---|---| +| OGA C++ API | [LLM-examples/](https://github.com/amd/RyzenAI-SW/tree/main/LLM-examples) | [hybrid_oga.html](https://ryzenai.docs.amd.com/en/latest/hybrid_oga.html) (OGA Flow) | `examples/llms/oga_api/` | `llms/oga-api/index.mdx` (auto-gen) | +| OGA Inference (Python) | [LLM-examples/](https://github.com/amd/RyzenAI-SW/tree/main/LLM-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Running GPT-OSS-20B with chat template) | `examples/llms/oga_inference/` | `llms/oga-inference/index.mdx` (auto-gen) | +| Fine-tune and Deploy LLMs | [LLM-examples/](https://github.com/amd/RyzenAI-SW/tree/main/LLM-examples) | New | `examples/llms/llm-sft-deploy/` | `llms/llm-sft-deploy/index.mdx` (auto-gen) | +| RAG with OGA | [LLM-examples/](https://github.com/amd/RyzenAI-SW/tree/main/LLM-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (OGA-based RAG LLM) | `examples/llms/RAG-OGA/` | `llms/rag-oga/index.mdx` (auto-gen) | +| Vision-Language Models (VLM) | [LLM-examples/](https://github.com/amd/RyzenAI-SW/tree/main/LLM-examples) | 
[examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Running VLM on RyzenAI NPU) | `examples/llms/VLM/` | `llms/vlm/index.mdx` (auto-gen) | + +### Audio Tutorials + +| Title | Old GitHub Location | Old Docs Site Reference | New `examples/` Location | New Docs MDX | +|---|---|---|---|---| +| Running Whisper on Ryzen AI | [Demos/](https://github.com/amd/RyzenAI-SW/tree/main/Demos) | [whisper_cpp.html](https://ryzenai.docs.amd.com/en/latest/whisper_cpp.html) + [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) | `examples/audio/whisper/` | `audio/whisper/index.mdx` (auto-gen) | + +### Multimodal Tutorials + +| Title | Old GitHub Location | Old Docs Site Reference | New `examples/` Location | New Docs MDX | +|---|---|---|---|---| +| NPU-GPU Pipeline | [Demos/](https://github.com/amd/RyzenAI-SW/tree/main/Demos) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (NPU-GPU pipeline on RyzenAI) | `examples/multimodal/npu-gpu-pipeline/` | `multimodal/npu-gpu-pipeline/index.mdx` (auto-gen) | + +### NLP Tutorials + +| Title | Old GitHub Location | Old Docs Site Reference | New `examples/` Location | New Docs MDX | +|---|---|---|---|---| +| DistilBERT Text Classification | [Transformer-examples/](https://github.com/amd/RyzenAI-SW/tree/main/Transformer-examples) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (Finetuned DistilBERT) | `examples/nlp/distilbert/` | `nlp/distilbert/index.mdx` (auto-gen) | + +### Tool Tutorials + +| Title | Old GitHub Location | Old Docs Site Reference | New `examples/` Location | New Docs MDX | +|---|---|---|---|---| +| NPU Benchmark Tool | [onnx-benchmark/](https://github.com/amd/RyzenAI-SW/tree/main/onnx-benchmark) | [examples.html](https://ryzenai.docs.amd.com/en/latest/examples.html) (ONNX benchmark utilities) | `examples/tools/benchmarking/` | `tools/benchmarking/index.mdx` (auto-gen) | +| NPU Check Utilities | 
[utilities/npu_check/](https://github.com/amd/RyzenAI-SW/tree/main/utilities/npu_check) | New | `examples/tools/npu-check/` | `tools/npu-check/index.mdx` (auto-gen) | + +--- + +## New Pages (no old RST or GitHub equivalent) + +| New MDX File | Title | Notes | +|---|---|---| +| `getting-started/quickstart.mdx` | Quickstart | New content | +| `applications/index.mdx` | Applications | New: AMD + third-party apps | +| `models-tutorials/index.mdx` | Models and Tutorials | New landing (incorporates old `examples.rst`) | +| `models-tutorials/audio/index.mdx` | Audio Models | New category overview | +| `models-tutorials/audio/supported-models.mdx` | Audio: Supported Models | New model list | +| `models-tutorials/llms/index.mdx` | Large Language Models | New category overview | +| `models-tutorials/llms/supported-models.mdx` | Supported LLMs | New: detailed HuggingFace table | +| `models-tutorials/multimodal/index.mdx` | Multimodal Models | New category overview | +| `models-tutorials/multimodal/supported-models.mdx` | Multimodal: Supported Models | New model list | +| `models-tutorials/vision/index.mdx` | Vision Models | New category overview | +| `models-tutorials/vision/supported-models.mdx` | Vision: Supported Models | New model list | +| `models-tutorials/nlp/index.mdx` | NLP Models | New category overview | +| `develop/index.mdx` | Develop and Tools | New landing page | +| `tools/index.mdx` | Tools | New landing page | +| `reference/index.mdx` | Reference | New landing page | + +--- + +## Complete Current Sidebar (alphabetical within sections) + +### Getting Started (landing: Overview and Architecture) + +- Installation +- Quickstart +- Supported Hardware + +### Applications (landing: Applications index) + +### Models and Tutorials (landing: Models and Tutorials index) + +**Audio** (click goes to Audio Models overview) +- Audio: Supported Models +- Running Whisper on Ryzen AI + +**Large Language Models** (click goes to LLM Deployment Overview) +- DistilBERT Text 
Classification +- Fine-tune and Deploy LLMs +- High-Level Python SDK +- OGA C++ API +- OGA Inference (Python) +- OnnxRuntime GenAI (OGA) Flow +- RAG with OGA +- Running LLM on Linux +- Server Interface (REST API) +- Supported LLMs +- Vision-Language Models (VLM) + +**Multimodal** (click goes to Multimodal: Supported Models) +- NPU-GPU Pipeline + +**Vision** (click goes to Vision Models overview) +- Hello World Tutorial +- iGPU Getting Started +- Image Classification +- ResNet BF16 C++ Deployment +- ResNet BF16 Tutorial +- ResNet INT8 Tutorial +- Stable Diffusion Demo +- Super Resolution +- Vision: Supported Models +- YOLOv8m Object Detection +- YOLOv8s-WorldV2 + +### Develop and Tools (landing: Develop and Tools index) + +- Advanced Quantization +- AI Analyzer +- Application Development +- CVML Library (Tutorial) +- DirectML Flow (GPU) +- Model Compilation and Deployment +- Model Quantization +- NPU Benchmark Tool +- NPU Check Utilities +- NPU Management Interface +- ONNX Model Preparation +- Operator Preparation +- Quark Quantization (Tutorial) +- Ryzen AI CVML Library +- Torchvision Inference + +### Reference (landing: Reference index) + +- Changelog +- Licensing Information +- Model Table +- Supported Operators + +--- + +## Pages Removed or Merged + +| Page | What Happened | +|---|---| +| `examples.rst` (old docs site) | **Merged** into `models-tutorials/index.mdx` | +| `hybrid_oga_pip_install_draft.rst` (old docs site) | **Deleted** (unpublished draft) | +| `models-tutorials/examples.mdx` (new site) | **Merged** into `models-tutorials/index.mdx` | +| `getting-started-resnet/index.mdx` (new site) | **Removed from sidebar** (thin wrapper, only linked to INT8/BF16) | +| `getting-started-resnet/int8/index.mdx` (new site) | **Removed from sidebar** (no substantive content) | + +--- + +## Old GitHub Repo Structure to New Structure + +The old [github.com/amd/RyzenAI-SW](https://github.com/amd/RyzenAI-SW) repo had these top-level folders for examples: + +| Old GitHub 
Folder | What it Contained | New Location | +|---|---|---| +| `CNN-examples/` | ResNet INT8/BF16, Hello World, iGPU, YOLOv8 | `examples/vision/` (various subfolders) | +| `Transformer-examples/` | DistilBERT, Image Classification | `examples/nlp/distilbert/` + `examples/vision/image_classification/` | +| `LLM-examples/` | OGA API, OGA Inference, RAG, VLM, SFT Deploy | `examples/llms/` (various subfolders) | +| `Demos/` | NPU-GPU Pipeline, Whisper | `examples/multimodal/npu-gpu-pipeline/` + `examples/audio/whisper/` | +| `Ryzen-AI-CVML-Library/` | CVML samples (face detection, face mesh) | `examples/vision/cvml/` | +| `onnx-benchmark/` | ONNX benchmark tool | `examples/tools/benchmarking/` | +| `utilities/npu_check/` | NPU check utility | `examples/tools/npu-check/` | + +--- + +## i18n (Translations) + +Translation scaffolding exists for 11 locales. English fallback content is pre-populated at `website/i18n/{locale}/docusaurus-plugin-content-docs/current/`. + +| Locale | Language | +|---|---| +| `en` | English (default) | +| `zh-Hans` | Chinese (Simplified) | +| `ja` | Japanese | +| `ko` | Korean | +| `pt-BR` | Portuguese (Brazil) | +| `es` | Spanish | +| `hi` | Hindi | +| `de` | German | +| `fr` | French | +| `ru` | Russian | +| `uk` | Ukrainian | + +The Docusaurus dev server only serves one locale at a time. To test translations locally, run `npx docusaurus start --locale fr`. The production build generates all locales. diff --git a/docs/applications/index.mdx b/docs/applications/index.mdx new file mode 100644 index 00000000..bf3a20cd --- /dev/null +++ b/docs/applications/index.mdx @@ -0,0 +1,42 @@ +--- +title: 💻 Applications with Ryzen +sidebar_position: 1 +--- + +import DocCardList from '@theme/DocCardList'; + +# 💻 Applications with Ryzen + +AI applications optimized for AMD Ryzen AI hardware, running locally on NPU and GPU. 
+ +| Application | Developer | Description | NPU | GPU | +|------------|-----------|-------------|-----|-----| +| [AI Toolkit for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-windows-ai-studio.windows-ai-studio) | Microsoft | AI model development toolkit for Visual Studio Code | | Yes | +| [Amuse](https://www.tensorstack.ai/amuse) | TensorStack | AI image generation desktop application | Yes | Yes | +| [AnythingLLM](https://anythingllm.com/) | Mintplex Labs | All-in-one desktop LLM application | Yes | | +| [Belt Desktop](https://belt.ai) | Belt AI | Local AI assistant for productivity | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/belt.html) | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/belt.html) | +| [Camo Studio](https://reincubate.com) | Reincubate | AI-powered webcam with background effects | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/camo.html) | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/camo.html) | +| [CodeGPT](https://codegpt.co/) | CodeGPT | AI code assistant for VS Code | Yes | | +| [ComfyUI](https://github.com/comfyanonymous/ComfyUI) | Comfy Org | Node-based Stable Diffusion workflow editor | Yes | Yes | +| [Continue](https://continue.dev/) | Continue | Open-source AI code assistant for VS Code and JetBrains | Yes | | +| [FastFlowLM](https://fastflowlm.com/) | FastFlowLM | High-performance local LLM inference engine | Yes | | +| [GAIA](https://github.com/amd/gaia) | AMD | Open-source local AI agent framework | [Yes](https://github.com/amd/gaia#features) | [Yes](https://github.com/amd/gaia#features) | +| [Generate](https://iterate.ai/platform/generate) | Iterate.ai | Enterprise AI platform for building AI applications | [Yes](https://www.amd.com/en/resources/case-studies/iterate-ai.html) | [Yes](https://www.amd.com/en/resources/case-studies/iterate-ai.html) | +| [Hyperlink](https://hyperlink.nexa.ai) | Nexa AI | Local AI-powered web browser | 
[Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/nexa.html) | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/nexa.html) | +| [Lemonade](https://github.com/lemonade-sdk/lemonade) | AMD | LLM serving platform with OpenAI-compatible API | [Yes](https://github.com/lemonade-sdk/lemonade#supported-hardware) | [Yes](https://github.com/lemonade-sdk/lemonade#supported-hardware) | +| [LM Studio](https://lmstudio.ai) | LM Studio | Desktop application for running local LLMs | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/lm-studio.html) | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/lm-studio.html) | +| [LM-Eval](https://github.com/EleutherAI/lm-evaluation-harness) | EleutherAI | LLM evaluation and benchmarking harness | Yes | | +| [Open WebUI](https://openwebui.com/) | Open WebUI | Self-hosted LLM chat interface | Yes | | +| [Promeo](https://www.cyberlink.com) | CyberLink | AI video creation and editing suite | Yes | Yes | +| [Render FX](https://www.distinctplugins.io/renderfx) | Distinct AI | Real-time AI image generation plugin | | Yes | +| [Scam Detector](https://www.mcafee.com) | McAfee | AI-powered scam and phishing detection | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/mcafee.html) | | +| [Video AI](https://www.topazlabs.com/video-ai) | Topaz Labs | AI video enhancement and upscaling | | [Yes](https://gpuopen.com/learn/topaz-labs-brings-generative-ai-video-enhancement/) | +| [Voicemod](https://www.voicemod.net) | Voicemod | Real-time AI voice changer | | [Yes](https://www.amd.com/en/ecosystem/isv/consumer-partners/voicemod.html) | + +## Lemonade Server Compatibility + +[Lemonade](https://github.com/lemonade-sdk/lemonade) is AMD's open-source LLM serving platform. It provides an [OpenAI-compatible REST API](/models-tutorials/llms/server-interface) that allows any application supporting the OpenAI API standard to run LLMs locally on Ryzen AI hardware. 
Applications marked with NPU support above can use Lemonade Server as a backend for local inference. + +:::tip +Want to build your own application on Ryzen AI? See the [Tools](/develop) section for SDK documentation and the [Models & Tutorials](/models-tutorials) section for supported models. +::: diff --git a/docs/develop/app-development.mdx b/docs/develop/app-development.mdx new file mode 100644 index 00000000..0025251b --- /dev/null +++ b/docs/develop/app-development.mdx @@ -0,0 +1,144 @@ +--- +title: Application Development +sidebar_position: 1 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Application Development + + + +This page captures requirements and recommendations for developers looking to create, package and distribute applications targeting NPU-enabled AMD processors. + +## VitisAI EP / NPU Driver Compatibility + +The VitisAI EP requires a compatible version of the NPU drivers. For each version of the VitisAI EP, compatible drivers are bounded by a minimum version and a maximum release date. NPU drivers are backward compatible with VitisAI EP released up to three years. The maximum driver release date is therefore set to three years after the release date of the corresponding VitisAI EP. + +The following table summarizes the driver requirements for the different versions of the VitisAI EP. 
+ +| VitisAI EP version | Minimum NPU Driver version | Maximum NPU Driver release date | +|-------------------|----------------------------|----------------------------------| +| 1.7 | 32.0.203.280 | Jan 22nd, 2029 | +| 1.6 | 32.0.203.280 | Oct 7th, 2028 | +| 1.5 | 32.0.203.280 | July 1st, 2028 | +| 1.4.1 | 32.0.203.259 | May 13th, 2028 | +| 1.4 | 32.0.203.257 | March 25th, 2028 | +| 1.3.1 | 32.0.203.242 | January 17th, 2028 | +| 1.3 | 32.0.203.237 | November 26th, 2027 | +| 1.2 | 32.0.201.204 | July 30th, 2027 | + +The application must verify that NPU drivers compatible with the version of the Vitis AI EP in use are installed. + +## APU Types + +The Ryzen AI Software supports various types of NPU-enabled APUs, referred to as PHX, HPT, STX, and KRK. To programmatically determine the type of the local APU, you can enumerate the PCI devices and look for an instance with a matching Hardware ID. + +| Vendor | Device | Revision | APU Type | +|--------|--------|----------|----------| +| 0x1022 | 0x1502 | 0x00 | PHX or HPT | +| 0x1022 | 0x17F0 | 0x00 | STX | +| 0x1022 | 0x17F0 | 0x10 | STX | +| 0x1022 | 0x17F0 | 0x11 | STX | +| 0x1022 | 0x17F0 | 0x20 | KRK | + +The application must verify that it is running on an AMD processor with an NPU, and that the NPU type is supported by the version of the Vitis AI EP in use. + +## NPU Utilities + +When deploying applications across various NPU devices, users can determine the specific type of NPU device using Python/C++ code. Based on the detected device—such as PHX, STX, KRK, or other devices—users should configure the appropriate provider options as outlined in [Model Compilation and Deployment](/develop/model-deployment). 
+ +For Python, the user can get the specific NPU type using the example `get_npu_info` function provided in `%RYZEN_AI_INSTALLATION_PATH%\quicktest\quicktest.py`. + +For C++, a set of APIs are provided to extract information about the NPU and check driver compatibility of the VitisAI EP with the rest of the environment. For details refer to [C++ NPU Utilities](/tools/npu-check). + +## Application Development Requirements + +### ONNX-RT Session + +The application should only use the Vitis AI Execution Provider if the following conditions are met: + +- The application is running on an AMD processor with an NPU type supported by the version of the Vitis AI EP in use. See [APU Types](#apu-types). +- NPU drivers compatible with the version of the Vitis AI EP being used are installed. See [compatibility table](#vitisai-ep--npu-driver-compatibility). + +:::info +**NOTE**: Sample C++ code that implements the compatibility checks to be performed before using the Vitis AI EP is available [here](/tools/npu-check). +::: + +### VitisAI EP Provider Options + +For INT8 models, the application should detect the type of APU present (PHX, HPT, STX, or KRK) and set the `target` and `xclbin` provider options accordingly. Refer to the section on [using INT8 models](/develop/model-deployment#using-int8-models) for more details. + +For BF16 models, the application should set the `config_file` provider option to the same file that was used to precompile the BF16 model. Refer to the section on [using BF16 models](/develop/model-deployment#using-bf16-models) for more details. + +### Pre-Compiled Models + +To avoid the overhead of recompiling models, it is highly recommended to save the compiled models and use these precompiled versions in the final application. Precompiled models can be loaded instantly and executed immediately on the NPU, significantly improving session creation time and overall end-user experience. 
+ +AMD recommends using the ONNXRuntime [EP Context Cache](/develop/model-deployment#onnx-runtime-ep-context-cache) feature for saving and reloading compiled models. + +**BF16 models** + +The deployment version of the VitisAI Execution Provider (EP) does not support the on-the-fly compilation of BF16 models. Applications utilizing BF16 models must include pre-compiled versions of these models. The VitisAI EP can then load the pre-compiled models and deploy them efficiently on the NPU. + +**INT8 models** + +Including pre-compiled versions of INT8 models is recommended but not mandatory. + +## Application Packaging Requirements + +An updated version of the Ryzen AI deployment DLLs is available for download at the [following link](https://download.amd.com/opendownload/RyzenAI/rai-deployment-dlls-1.7.0.p1.zip). These updated DLLs are intended to replace the ones located in the %RYZEN_AI_INSTALLATION_PATH%/deployment folder of the Ryzen AI 1.7 installation tree. + +A C++ application built on the Ryzen AI ONNX Runtime must include the following components in its distribution package: + +**For INT8 models** + +- DLLs: + + - %RYZEN_AI_INSTALLATION_PATH%\deployment\aiecompiler_client.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\DirectML.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\dyn_dispatch_core.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_providers_shared.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_providers_vitisai.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_vitis_ai_custom_ops.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_vitisai_ep.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\zlib.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\zstd.dll + +- NPU Binary files (.xclbin) from the `%RYZEN_AI_INSTALLATION_PATH%\voe-4.0-win_amd64\xclbins` folder + +- Recommended but not mandatory: pre-compiled models in the form of [Onnx Runtime EP context 
models](/develop/model-deployment#onnx-runtime-ep-context-cache) + +**For BF16 models** + +- DLLs: + + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_providers_shared.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_providers_vitisai.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime_vitisai_ep.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\dyn_dispatch_core.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\libutf8_validity.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\ryzenai_onnx_utils.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\zlib.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\zstd.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\abseil_dll.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\aiecompiler_client.dll + +- Pre-compiled models in the form of [Vitis AI EP cache folders](/develop/model-deployment#vitisai-ep-cache) +- To compile BF16 models from C++ code, applications should include `${CONDA_PREFIX}/Lib/site-packages/flexml/flexml_extras/lib/vaiml.dll` in addition to the DLLs listed above. 
+ +**For LLMs** + +- DLLs: + + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime-genai.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnxruntime.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\ryzen_mm.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\onnx_custom_ops.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\libutf8_validity.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\abseil_dll.dll + - %RYZEN_AI_INSTALLATION_PATH%\deployment\DirectML.dll diff --git a/docs/develop/cvml-library.mdx b/docs/develop/cvml-library.mdx new file mode 100644 index 00000000..22a3873d --- /dev/null +++ b/docs/develop/cvml-library.mdx @@ -0,0 +1,168 @@ +--- +title: Ryzen AI CVML library +sidebar_position: 2 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Ryzen AI CVML library + + + +The Ryzen AI CVML libraries build on top of the Ryzen AI drivers and execution infrastructure to provide powerful AI capabilities to C++ applications without having to worry about training specific AI models and integrating them to the Ryzen AI framework. + +Each Ryzen AI CVML library feature offers a simple C++ application programming interface (API) that can be easily incorporated into existing applications. The following AI features are currently available, + +- **Depth Estimation**: Generates a depth map to assess relative distances within a two-dimensional image. +- **Face Detection**: Identifies and locates faces within an image. +- **Face Mesh**: Constructs a mesh overlay of landmarks for a specified facial image. + +The Ryzen AI CVML library is distributed through the RyzenAI-SW Github repository: [Ryzen-AI-CVML-Library](/models-tutorials/vision/cvml) + +## Building sample applications + +This section describes the steps to build Ryzen AI CVML library sample applications. 
Before starting, ensure that the following prerequisites are available in the build environment, + +- CMake, version 3.18 or newer +- C++ compilation toolchain. On Windows, this may be Visual Studio's "Desktop development with C++" build tools, or a comparable C++ toolchain +- OpenCV, version 4.11 or newer + +### Navigate to the folder containing Ryzen AI samples + +Download the Ryzen AI CVML sources, and go to the 'samples' sub-folder of the library. + + + + +```powershell +git clone https://github.com/amd/RyzenAI-SW.git -b main --depth-1 +cd RyzenAI-SW\Ryzen-AI-CVML-Library\samples +``` + + + + +```bash +git clone https://github.com/amd/RyzenAI-SW.git -b main --depth-1 +cd RyzenAI-SW/Ryzen-AI-CVML-Library/samples +``` + + + + +### OpenCV libraries + +Ryzen AI CVML library samples make use of OpenCV, so set an environment variable to let the build scripts know where to find OpenCV. + + + + +```powershell +set OPENCV_INSTALL_ROOT= +``` + + + + +```bash +export OPENCV_INSTALL_ROOT= +``` + + + + +### Build Instructions + +Create a build folder and use CMAKE to build the sample(s). + + + + +```powershell +mkdir build +cmake -S %CD% -B %CD%\build -DOPENCV_INSTALL_ROOT=%OPENCV_INSTALL_ROOT% +cmake --build %CD%\build --config Release +``` + + + + +```bash +mkdir build +cmake -S $PWD -B $PWD/build -DOPENCV_INSTALL_ROOT=$OPENCV_INSTALL_ROOT +cmake --build $PWD/build --config Release +``` + + + + +The compiled sample applications will be placed in `build/<application>/Release` folders under the `samples` folder. + +## Running sample applications + +This section describes how to execute Ryzen AI CVML library sample applications. + +### Update the console and/or system PATH + +Ryzen AI CVML library applications need to be able to find the library files. 
+ + + + +```powershell +set PATH=%PATH%;\windows +set PATH=%PATH%;%OPENCV_INSTALL_ROOT%\x64\vc16\bin +``` + + + + +```bash +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/linux +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OPENCV_INSTALL_ROOT/lib +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/xilinx/xrt/lib +``` + + + + +Adjust the aforementioned commands to match the actual location of Ryzen AI and OpenCV libraries, respectively. + +### Select an input source (image or video) + +Ryzen AI CVML library samples can accept a variety of image and video input formats, or even open the default camera on the system if "0" is specified as an input. + +In this example, a publicly available video file is used for the application's input. The following command downloads a video file and saves it locally as 'dancing.mp4': + +```bash +curl -o dancing.mp4 https://videos.pexels.com/video-files/4540332/4540332-hd_1920_1080_25fps.mp4 +``` + +### Execute the sample application + +Finally, the previously built sample application may be executed with the selected input source. + + + + +```powershell +build\cvml-sample-depth-estimation\Release\cvml-sample-depth-estimation.exe -i dancing.mp4 +``` + + + + +```bash +build/cvml-sample-depth-estimation/Release/cvml-sample-depth-estimation.exe -i dancing.mp4 +``` + + + + +--- + +Ryzen AI is licensed under MIT License. Refer to the LICENSE file for the full license text and copyright notice. diff --git a/docs/develop/index.mdx b/docs/develop/index.mdx new file mode 100644 index 00000000..42099c10 --- /dev/null +++ b/docs/develop/index.mdx @@ -0,0 +1,12 @@ +--- +title: 🔧 Tools +sidebar_position: 1 +--- + +import DocCardList from '@theme/DocCardList'; + +# 🔧 Tools + +Libraries, SDKs, and developer tools for building applications on AMD Ryzen AI hardware. This section covers application development, model deployment, quantization, analysis tools, and GPU acceleration. 
+ + diff --git a/docs/develop/model-deployment.mdx b/docs/develop/model-deployment.mdx new file mode 100644 index 00000000..0850036c --- /dev/null +++ b/docs/develop/model-deployment.mdx @@ -0,0 +1,455 @@ +--- +title: Model Compilation and Deployment +sidebar_position: 3 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Model Compilation and Deployment + + + +## Introduction + +The Ryzen AI Software supports models saved in the ONNX format and uses ONNX Runtime as the primary mechanism to load, compile and run models. + +:::info +**NOTE**: Models with ONNX opset 17 are recommended. If your model uses a different opset version, consider converting it using the [ONNX Version Converter](https://github.com/onnx/onnx/blob/main/docs/VersionConverter.md). +::: + +For a complete list of supported operators, consult this page: [Supported Operators](/reference/supported-operators). + +## Loading Models + +Models are loaded by creating an ONNX Runtime `InferenceSession` using the Vitis AI Execution Provider (VAI EP): + +```python +import onnxruntime + +session_options = onnxruntime.SessionOptions() +vai_ep_options = {} # Vitis AI EP options go here + +session = onnxruntime.InferenceSession( + path_or_bytes = model, # Path to the ONNX model + sess_options = session_options, # Standard ORT options + providers = ['VitisAIExecutionProvider'], # Use the Vitis AI Execution Provider + provider_options = [vai_ep_options] # Pass options to the Vitis AI Execution Provider +) +``` + +The `provider_options` parameter enables the configuration of the Vitis AI Execution Provider (EP). For a comprehensive list of supported provider options, refer to the [Vitis AI EP Options Reference Guide](#vitis-ai-ep-options-reference-guide) below. + +When a model is first loaded into an ONNX Runtime (ORT) inference session, it is compiled into the format required by the NPU. 
The resulting compiled output can be saved as an ORT EP context file or stored in the Vitis AI EP cache directory. + +If a compiled version of the ONNX model is already available — either as an EP context file or within the Vitis AI EP cache — the model will not be recompiled. Instead, the precompiled version will be loaded automatically. This greatly reduces session creation time and improves overall efficiency. For more details, refer to the section on [Managing Compiled Models](#managing-compiled-models). + +## Deploying Models + +Once the ONNX Runtime inference session is initialized and the model is compiled, the model is deployed using the ONNX Runtime `run()` API: + +```python +input_data = {} +for input in session.get_inputs(): + input_data[input.name] = ... # Initialize input tensors + +outputs = session.run(None, input_data) # Run the model +``` + +The ONNX graph is automatically partitioned into multiple subgraphs by the Vitis AI Execution Provider (EP). During deployment, the subgraph(s) containing operators supported by the NPU are executed on the NPU. The remaining subgraph(s) are executed on the CPU. This graph partitioning and deployment technique across CPU and NPU is fully automated by the VAI EP and is totally transparent to the end-user. + +## Vitis AI EP Options Reference Guide + +### VitisAI EP Provider Options + +The `provider_options` parameter of the ORT `InferenceSession` allows passing options to configure the Vitis AI EP. The following options are supported: + +| Option | Description | Values / Type | Default | +|--------|-------------|---------------|---------| +| `config_file` | Config file for BF16 compilation options. See [Config File Options](#config-file-options). | String | N/A | +| `target` | Set which Vitis AI EP backend to use for compiling/running integer model. For details see [Using INT8 Models](#using-int8-models). | `X2`, `X1` | `X2` | +| `xclbin` | To be used only when running INT8 CNN models on PHX/HPT devices. 
For details see [Using INT8 Models](#using-int8-models) | String | None | +| `encryption_key` | 256-bit key for encrypting EP context model. See [EP Context Cache](#onnx-runtime-ep-context-cache). | String (64 hex) | None | +| `opt_level` | Compiler optimization level for INT8 only. | 0, 1, 2, 3, 65536 (maximum effort, experimental) | 0 | +| `cache_dir` | VitisAI cache directory. For INT8, `enable_cache_file_io_in_mem` must be 0. | String | `C:\temp\%USERNAME%\vaip\.cache` | +| `cache_key` | Subfolder in cache for compiled model. For INT8, `enable_cache_file_io_in_mem` must be 0. | String | MD5 hash of model | +| `enable_cache_file_io_in_mem` | Keep compiled model in memory (1) or save to disk (0). INT8 only. | 0, 1 | 1 | +| `ai_analyzer_visualization` | Enable compile-time analysis data. | Boolean | False | +| `ai_analyzer_profiling` | Enable inference-time analysis data. | Boolean | False | + +### Enabling ONNX Runtime Logs + +To enable detailed logging for debugging purposes, set the ONNX Runtime session log severity level using `SessionOptions.log_severity_level`: + +```python +import onnxruntime + +session_options = onnxruntime.SessionOptions() +session_options.log_severity_level = 1 # Set log level (see table below) + +vai_ep_options = {} + +session = onnxruntime.InferenceSession( + path_or_bytes = model, + sess_options = session_options, + providers = ['VitisAIExecutionProvider'], + provider_options = [vai_ep_options] +) +``` + +| Level | Description | Value | +|-------|-------------|-------| +| Verbose | All messages (most detailed) | 0 | +| Info | Informational messages and above | 1 | +| Warning (Default) | Warnings and above | 2 | +| Error | Errors and above | 3 | +| Fatal | Fatal errors only | 4 | + +:::warning +**NOTE**: The `log_level` parameter in provider options has been deprecated. To enable logging, use `SessionOptions.log_severity_level` as shown in the example above. 
+::: + +### Config File Options + +When compiling BF16 models, a JSON configuration file can be provided to the VitisAI EP using the `config_file` provider option. This configuration file is used to specify additional options to the compiler. + +The default configuration file for compiling BF16 models contains the following: + +```json +{ + "passes": [ + { + "name": "init", + "plugin": "vaip-pass_init" + }, + { + "name": "vaiml_partition", + "plugin": "vaip-pass_vaiml_partition", + "vaiml_config": { + "optimize_level": 1, + "preferred_data_storage": "auto" + } + } + ], + "target": "VAIML", + "targets": [ + { + "name": "VAIML", + "pass": [ + "init", + "vaiml_partition" + ] + } + ] +} +``` + +The `vaiml_config` section of the configuration file contains the user options. The supported user options are described below. + +| Option | Description | Values | Default | +|--------|-------------|--------|---------| +| `optimize_level` | Compiler optimization level. | 1, 2, 3 | 1 | +| `preferred_data_storage` | Data layout: "auto" (let compiler choose), "vectorized" (for CNNs), "unvectorized" (for Transformers). | auto, vectorized, unvectorized | auto | + +## Using BF16 Models + +When compiling BF16 models, an optional configuration file can be provided to the VitisAI EP. This file is specified using the `config_file` provider option. For more details, refer to the [Config File Options](#config-file-options) section. + +:::info +**NOTE**: + +- Running BF16 Models is only supported for STX/KRK or newer devices. For the model compatibility table see [Release Notes](/reference/changelog). +- For C++ applications that need to compile BF16 models at runtime, include `${CONDA_PREFIX}/Lib/site-packages/flexml/flexml_extras/lib/vaiml.dll` along with the DLLs specified in [Application Development](/develop/app-development). However, using pre-compiled BF16 models is recommended for C++ deployment. 
+:::
+
+### Sample Python Code
+
+Python example loading a configuration file called vai_ep_config.json:
+
+```python
+import onnxruntime
+
+vai_ep_options = {
+    'config_file': 'vai_ep_config.json',
+}
+
+session = onnxruntime.InferenceSession(
+    "resnet50.onnx",
+    providers=['VitisAIExecutionProvider'],
+    provider_options=[vai_ep_options]
+)
+```
+
+### Sample C++ Code
+
+C++ example loading a configuration file called vai_ep_config.json:
+
+```cpp
+#include <onnxruntime_cxx_api.h>
+
+auto onnx_model = "resnet50.onnx";
+Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "resnet50_bf16");
+auto session_options = Ort::SessionOptions();
+auto vai_ep_options = std::unordered_map<std::string, std::string>({});
+vai_ep_options["config_file"] = "vai_ep_config.json";
+session_options.AppendExecutionProvider_VitisAI(vai_ep_options);
+auto session = Ort::Session(
+    env,
+    std::basic_string<char>(onnx_model.begin(), onnx_model.end()).c_str(),
+    session_options);
+```
+
+## Using INT8 Models
+
+Ryzen AI features a new compiler for INT8 models. This compiler is enabled by default and provides the following improvements:
+
+- Improved ease of use and enhanced performance for models running on STX, KRK, and later devices.
+- General asymmetric quantization support to enable third-party quantized models
+- Support for XINT8, A8W8, and A16W8 quantization configuration providing greater flexibility for model optimization.
+
+The `target` provider option can be used to select which backend to use when compiling the INT8 model. The option accepts the following values:
+
+- `X2` — Default backend for integer models. Supports STX, KRK and newer devices.
+- `X1` — Legacy backend for integer models. Supports PHX, HPT, STX and KRK devices. This setting should be used when running on PHX and HPT devices. It can also be used on STX and KRK devices in the cases where better results are achieved than with the default X2 setting.
+ +### Device-Specific Settings + +Suitable settings for the `target` and `xclbin` provider options are dependent on the type of device. The application must perform a device detection check before configuring the Vitis AI EP. For more details on how to do this, refer to the [Application Development](/develop/app-development) page. + +When compiling INT8 models on STX/KRK devices: + +- The `target` provider option can be set to `X2` (default) or `X1` (legacy backend, may provide better results for some models). +- The `xclbin` provider option must not be set. + +When compiling INT8 models on PHX/HPT devices: + +- The `target` provider option must be set to `X1`. +- The `xclbin` provider option must be set to `%RYZEN_AI_INSTALLATION_PATH%\voe-4.0-win_amd64\xclbins\phoenix\4x4.xclbin` or to a copy of this file included in the final version of the application. The legacy "1x4" and "Nx4" xclbin files are no longer supported and should not be used. + +### Sample Python Code + +Python example code for running an INT8 model on STX/KRK NPU (target=X2, no xclbin): + +```python +import os +import onnxruntime + +vai_ep_options = { + 'cache_dir': './model_cache', + 'cache_key': 'resnet_trained_for_cifar10', + 'enable_cache_file_io_in_mem': '0', + 'target': 'X2' # Default option 'X2' +} + +session = onnxruntime.InferenceSession( + "resnet50_int8.onnx", + providers=['VitisAIExecutionProvider'], + provider_options=[vai_ep_options] +) +``` + +### Sample C++ Code + +C++ example code for running an INT8 model on STX/KRK NPU (target=X2, no xclbin): + +```cpp +#include + +auto onnx_model = "resnet50_int8.onnx"; +Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "resnet50_int8"); +auto session_options = Ort::SessionOptions(); +auto vai_ep_options = std::unordered_map({}); +vai_ep_options["cache_dir"] = exe_dir + "\\my_cache_dir"; +vai_ep_options["cache_key"] = "resnet_trained_for_cifar10"; +vai_ep_options["enable_cache_file_io_in_mem"] = "0"; +vai_ep_options["target"] = "X2"; 
+session_options.AppendExecutionProvider_VitisAI(vai_ep_options);
+auto session = Ort::Session(
+    env,
+    std::basic_string<char>(onnx_model.begin(), onnx_model.end()).c_str(),
+    session_options);
+```
+
+## Managing Compiled Models
+
+To avoid the overhead of recompiling models, it is very advantageous to save the compiled models and use these pre-compiled versions in the final application. Pre-compiled models can be loaded instantaneously and immediately executed on the NPU. This greatly improves the session creation time and overall end-user experience.
+
+The RyzenAI Software supports two mechanisms for saving and reloading compiled models:
+
+- VitisAI EP Cache
+- ONNX Runtime EP Context Cache
+
+:::info
+**TIP**: The VitisAI EP Cache mechanism is most convenient to quickly iterate during the development cycle. The OnnxRuntime EP Context Cache mechanism is recommended for the final version of the application.
+:::
+
+### VitisAI EP Cache
+
+The VitisAI EP includes a built-in caching mechanism. When a model is compiled for the first time, it is automatically saved in the VitisAI EP cache directory. Any subsequent creation of an ONNX Runtime session using the same model will load the precompiled model from the cache directory, thereby reducing session creation time.
+
+The VitisAI EP Cache mechanism can be used to quickly iterate during the development cycle, but it is not recommended for the final version of the application.
+
+Cache directories generated by the Vitis AI Execution Provider should not be reused across different versions of the Vitis AI EP or across different versions of the NPU drivers.
+
+If using the VitisAI EP Cache, the application should check the version of the Vitis AI EP and of the NPU drivers. If the application detects a version change, it should delete the cache, or create a new cache directory with a different name.
+
+The location of the VitisAI EP cache is specified with the `cache_dir` and `cache_key` provider options. For INT8 models, the `enable_cache_file_io_in_mem` option must be set to 0, otherwise the output of the compiler is kept in memory and is not saved to disk.
+
+Python example:
+
+```python
+import onnxruntime
+from pathlib import Path
+
+vai_ep_options = {
+    'cache_dir': str(Path(__file__).parent.resolve()),
+    'cache_key': 'compiled_resnet50_int8',
+    'enable_cache_file_io_in_mem': '0'
+}
+
+session = onnxruntime.InferenceSession(
+    "resnet50_int8.onnx",
+    providers=['VitisAIExecutionProvider'],
+    provider_options=[vai_ep_options]
+)
+```
+
+In the example above, the cache directory is set to the absolute path of the folder containing the script being executed. Once the session is created, the compiled model is saved inside a subdirectory named `compiled_resnet50_int8` within the specified cache folder.
+
+### ONNX Runtime EP Context Cache
+
+The Vitis AI EP supports the ONNX Runtime EP context cache feature. This feature allows dumping and reloading a snapshot of the EP context before deployment.
+
+The user can enable dumping of the EP context by setting the `ep.context_enable` session option to 1.
+
+The following options can be used for additional control:
+
+- `ep.context_file_path` – Specifies the output path for the dumped context model.
+- `ep.context_embed_mode` – Embeds the EP context into the ONNX model when set to 1.
+
+For further details, refer to the official ONNX Runtime documentation: https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html
+
+#### EP Context Encryption
+
+By default, the generated context model is unencrypted and can be used directly during inference. If needed, the context model can be encrypted using one of the methods described below.
+
+**User-managed encryption**
+
+After the context model is generated, the developer can encrypt the generated file using a method of choice. At runtime, the encrypted file can be loaded by the application, decrypted in memory and passed as a serialized string to the inference session.
+ +This method gives complete control to the developer over the encryption process. + +**EP-managed encryption** + +The VitisAI EP can optionally encrypt the EP context model using AES256. This is enabled by passing an encryption key using the `encryption_key` VAI EP provider options. The key is a 256-bit value represented as a 64-digit string. At runtime, the same encryption key must be provided to decrypt and load the context model. + +With this method, encryption and decryption is seamlessly managed by the VitisAI EP. + +Python example: + +```python +import onnxruntime + +vai_ep_options = { + 'encryption_key': '89703f950ed9f738d956f6769d7e45a385d3c988ca753838b5afbc569ebf35b2' +} + +# Compilation session +session_options = onnxruntime.SessionOptions() +session_options.add_session_config_entry('ep.context_enable', '1') +session_options.add_session_config_entry('ep.context_file_path', 'context_model.onnx') +session_options.add_session_config_entry('ep.context_embed_mode', '1') +session = onnxruntime.InferenceSession( + path_or_bytes='resnet50_int8.onnx', # Load the ONNX model + sess_options=session_options, + providers=['VitisAIExecutionProvider'], + provider_options=[vai_ep_options] +) + +# Inference session +session_options = onnxruntime.SessionOptions() +session = onnxruntime.InferenceSession( + path_or_bytes='context_model.onnx', # Load the EP context model + sess_options=session_options, + providers=['VitisAIExecutionProvider'], + provider_options=[vai_ep_options] +) +``` + +C++ example: + +```cpp +Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ort"); + +// VAI EP Provider options +auto vai_ep_options = std::unordered_map({}); +vai_ep_options["encryption_key"] = "89703f950ed9f738d956f6769d7e45a385d3c988ca753838b5afbc569ebf35b2"; + +// Session options +auto session_options = Ort::SessionOptions(); +session_options.AppendExecutionProvider_VitisAI(vai_ep_options); + +// Inference session +auto onnx_model = "context_model.onnx"; // The EP context model +auto session = 
Ort::Session( + env, + std::basic_string(onnx_model.begin(), onnx_model.end()).c_str(), + session_options); +``` + +:::info +**NOTE**: It is possible to precompile the EP context model using Python and to deploy it using a C++ program. +::: + +## Operator Assignment Report + +The compiler can optionally generate a report on operator assignments across CPU and NPU. To generate this report: + +- The `enable_cache_file_io_in_mem` provider option must be set to 0 +- The XLNX_ONNX_EP_REPORT_FILE environment variable must be used to specify the name of the generated report. For instance: + +```bat +set XLNX_ONNX_EP_REPORT_FILE=vitisai_ep_report.json +``` + +When these conditions are satisfied, the report file is automatically generated in the cache directory. This report includes information such as the total number of nodes, the list of operator types in the model, and which nodes and operators run on the NPU or on the CPU. Additionally, the report includes node statistics, such as input to a node, the applied operation, and output from the node. + +Example report output: + +```json +{ + "deviceStat": [ + { + "name": "all", + "nodeNum": 400, + "supportedOpType": [ + "::Add", + "::Conv", + ... + ] + }, + { + "name": "CPU", + "nodeNum": 2, + "supportedOpType": [ + "::DequantizeLinear", + "::QuantizeLinear" + ] + }, + { + "name": "NPU", + "nodeNum": 398, + "supportedOpType": [ + "::Add", + "::Conv", + ... + ] + ... 
+``` + +To disable generation of the report, unset the XLNX_ONNX_EP_REPORT_FILE environment variable: + +```bat +set XLNX_ONNX_EP_REPORT_FILE= +``` diff --git a/docs/develop/model-quantization.mdx b/docs/develop/model-quantization.mdx new file mode 100644 index 00000000..41c81ffe --- /dev/null +++ b/docs/develop/model-quantization.mdx @@ -0,0 +1,82 @@ +--- +title: Model Quantization +sidebar_position: 5 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + + +**Model quantization** is the process of mapping high-precision weights/activations to a lower precision format, such as BF16/INT8, while maintaining model accuracy. This technique enhances the computational and memory efficiency of the model for deployment on NPU devices. It can be applied post-training, allowing existing models to be optimized without the need for retraining. + +The Ryzen AI compiler supports input models in the following formats: + +- CNN Models + + - INT8 (quantized) + - FP32 (automatically converted to BF16 during compilation) + +- Transformer Models: + + - FP32 (automatically converted to BF16 during compilation) + +Ryzen AI Software natively supports both CNN and Transformer models in floating-point (FP32) format. When FP32 models are provided as input, the VitisAI EP automatically converts them to bfloat16 (BF16) precision and processes them through the optimized BF16 compilation pipeline. + +For CNN models, AMD Quark quantization enables conversion to INT8 format, delivering improved inference performance compared to higher precision alternatives. This quantization pathway provides an additional optimization option for CNN workloads requiring maximum efficiency. + +The complete list of operations supported for different quantization types can be found in [Supported Operations](/reference/supported-operators). 
+ +## FP32 to BF16 Conversion + +Ryzen AI provides seamless support for deploying original floating-point (FP32) models on NPU hardware through automatic conversion to BFLOAT16 (BF16) format. The conversion from FP32 to BF16 is performed when the model is compiled by the VitisAI EP. BF16 is a 16-bit floating-point format designed to have the same exponent size as FP32, allowing a wide dynamic range, but with reduced precision to save memory and speed up computations. This feature enables developers to deploy models in their native format while leveraging the Ryzen AI automatic conversion for efficient execution on NPU. + +### FP32 to BF16 Examples + +Explore these practical examples demonstrating FP32 to BF16 conversion across different CNN, NLP model types: + +- [Image Classification](/models-tutorials/vision/image-classification) using ResNet50 model on NPU +- [Finetuned DistilBERT for Text Classification](/models-tutorials/nlp/distilbert) +- Advanced quantization techniques [Fast Finetuning](https://quark.docs.amd.com/latest/supported_accelerators/ryzenai/tutorial_convert_fp32_or_fp16_to_bf16.html) for BF16 models. + +## FP32 to INT8 Conversion + +Quantization to INT8 format introduces several challenges, primarily revolving around the potential drop in model accuracy. Choosing the right quantization parameters—such as data type, bit-width, scaling factors, and the decision between per-channel or per-tensor quantization—adds layers of complexity to the design process. These decisions significantly impact both model accuracy and performance. While **AMD Quark** is the recommended quantization tool, third-party tools that support QDQ (Quantize-Dequantize) operations can also be used for model quantization. 
+ +RyzenAI supports the following INT8 datatypes: + +- XINT8: uses symmetric INT8 activation and weights quantization with power-of-two scales +- A8W8: uses symmetric INT8 activation and weights quantization with float scales +- A16W8: uses symmetric INT16 activation and symmetric INT8 weights quantization with float scales + +[AMD Quark](https://quark.docs.amd.com/latest/supported_accelerators/ryzenai/index.html) is the recommended quantization tool to convert FP32 models to INT8. But third-party tools that support QDQ (Quantize-Dequantize) operations can also be used for model quantization to A8W8 and A16W8. + +### AMD Quark + +[AMD Quark](https://quark.docs.amd.com/latest/supported_accelerators/ryzenai/index.html) is a comprehensive cross-platform deep learning toolkit designed to simplify and enhance the quantization of deep learning models. Supporting both PyTorch and ONNX models, Quark empowers developers to optimize their models for deployment on a wide range of hardware backends, achieving significant performance gains without compromising accuracy. + +**AMD Quark** provides default configurations that support INT8 quantization. For example, `XINT8` uses symmetric INT8 activation and weights quantization with power-of-two scales using the MinMSE calibration method. + +For more challenging model quantization needs, **AMD Quark** supports different quantization configurations such as `A8W8`, `A16W8`, and advanced quantization techniques. For more details, refer to [AMD Quark for Ryzen AI](https://quark.docs.amd.com/latest/supported_accelerators/ryzenai/index.html) + +The quantization configuration can be customized using the `QuantizationConfig` class. 
The following example shows how to set up the quantization configuration for INT8 quantization: + +```python +quant_config = QuantizationConfig(calibrate_method=PowerOfTwoMethod.MinMSE, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + enable_npu_cnn=True, + extra_options={'ActivationSymmetric': True}) +config = Config(global_quant_config=quant_config) +print("The configuration of the quantization is {}".format(config)) +``` + +The user can use the `get_default_config('XINT8')` function to get the default configuration for INT8 quantization. + +### FP32 to INT8 Examples + +Explore practical INT8 quantization examples: + +- Running INT8 model on NPU using [Getting Started Tutorial](/models-tutorials/vision/cnn-examples) +- [AMD Quark Tutorial](/models-tutorials/vision/quark-quantization) for Ryzen AI Deployment +- Advanced quantization techniques [Fast Finetuning and Cross Layer Equalization](https://github.com/amd/RyzenAI-SW/blob/main/CNN-examples/quark_quantization/docs/advanced_quant_readme.md) for INT8 model diff --git a/docs/develop/onnx-model-preparation.mdx b/docs/develop/onnx-model-preparation.mdx new file mode 100644 index 00000000..05c7b2ed --- /dev/null +++ b/docs/develop/onnx-model-preparation.mdx @@ -0,0 +1,179 @@ +--- +title: ONNX Model Preparation +sidebar_position: 6 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Preparing OGA Models + + + +This section describes the process for preparing LLMs for deployment on a Ryzen AI PC using the hybrid or NPU-only execution mode. Currently, the flow supports only fine-tuned versions of the models already supported (as listed in the [hybrid OGA](/models-tutorials/llms/supported-models) page). For example, fine-tuned versions of Llama2 or Llama3 can be used. However, different model families with architectures not supported by the hybrid flow cannot be used. 
+
+For fine-tuned models that introduce architectural changes requiring new operator shapes not available in the Ryzen AI runtime, refer to the [Operator Preparation](/develop/operator-preparation) page.
+
+Preparing an LLM for deployment on a Ryzen AI PC involves 2 steps:
+
+1. **Quantization**: The pretrained model is quantized to reduce memory footprint and better map to compute resources in the hardware accelerators
+2. **Postprocessing**: During the postprocessing the model is exported to OGA followed by NPU-only or Hybrid execution mode specific postprocess to obtain the final deployable model.
+
+## Quantization
+
+### Prerequisites
+
+Linux machine with AMD (e.g., AMD Instinct MI Series) or Nvidia GPUs
+
+### Setup
+
+1. Create and activate Conda Environment
+
+```bash
+conda create --name <env_name> python=3.11
+conda activate <env_name>
+```
+
+2. If Using AMD GPUs, update PyTorch to use ROCm
+
+```bash
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1
+python -c "import torch; print(torch.cuda.is_available())" # Must return `True`
+```
+
+3. Download [AMD Quark 0.11](https://download.amd.com/opendownload/Quark/amd_quark-0.11.zip) and unzip the archive
+
+4. Install Quark:
+
+```bash
+cd <quark unzipped folder>
+pip install amd_quark-<version>.whl
+```
+
+5. Install other dependencies
+
+```bash
+pip install datasets
+pip install transformers
+pip install accelerate
+pip install evaluate
+pip install nltk
+```
+
+Some models may require a specific version of `transformers`. For example, ChatGLM3 requires version 4.44.0.
+
+### Generate Quantized Model
+
+Use the following command to run quantization. On a GPU-equipped Linux machine the quantization can take about 30-60 minutes.
+
+```bash
+cd examples/torch/language_modeling/llm_ptq/
+
+python quantize_quark.py \
+    --no_trust_remote_code \
+    --model_dir "meta-llama/Llama-2-7b-chat-hf" \
+    --output_dir <quantized safetensor output dir> \
+    --quant_scheme w_uint4_per_group_asym \
+    --group_size 128 \
+    --num_calib_data 128 \
+    --seq_len 512 \
+    --quant_algo awq \
+    --dataset pileval_for_awq_benchmark \
+    --model_export hf_format \
+    --data_type <bfloat16 or float16> \
+    --exclude_layers []
+```
+
+- Use `--data_type bfloat16` for bf16 pretrained model. For fp32/fp16 pretrained model use `--data_type float16`
+- Not using `--exclude_layers` parameter may result in model-specific defaults which may exclude certain layers like output layers.
+
+The quantized model is generated in the <quantized safetensor output dir> folder.
+
+:::info
+**Note:** For the Phi-4 model, the following quantization recipe is recommended for better accuracy:
+
+- Use `--quant_algo gptq`
+- Add `--group_size_per_layer lm_head 32`
+:::
+
+:::info
+**Note:** Currently the following files are not copied into the quantized model folder and must be copied manually:
+
+- For Phi-4 models: `configuration_phi3.py`
+- For ChatGLM-6b models: `tokenizer.json`
+:::
+
+## Postprocessing
+
+Copy the quantized model to the Windows PC with Ryzen AI installed, and activate the Ryzen AI Conda environment.
+ +```bash +conda activate ryzen-ai- +pip install onnx_ir +pip install torch==2.7.1 +``` + +Generate the final model for Hybrid execution mode: + +```bash +conda activate ryzen-ai- + +model_generate --hybrid +``` + +Generate the final model for NPU execution mode: + +```bash +conda activate ryzen-ai- + +model_generate --npu --optimize decode +``` + +Generate model for hybrid execution mode (prefill fused version) + +```bash +conda activate ryzen-ai- + +model_generate --hybrid --optimize prefill +``` + +- Prefill fused hybrid models are only supported for Phi-3.5-mini-instruct and Mistral-7B-Instruct-v0.2 +- Edit `genai_config.json` with the following entries + +```json +"decoder": { + "session_options": { + "log_id": "onnxruntime-genai", + "custom_ops_library": "onnx_custom_ops.dll", + "external_data_file": "token.pb.bin", + "custom_allocator": "ryzen_mm", + "config_entries": { + "dd_cache": "", + "hybrid_opt_token_backend": "gpu", + "hybrid_opt_max_seq_length": "4096", + "max_length_for_kv_cache": "4096" + }, + "provider_options": [] + }, + "filename": "fusion.onnx" +} +``` + +:::info +**Note**: During the `model_generate` step, the quantized model is first converted to an OGA model using ONNX Runtime GenAI Model Builder (version 0.9.2). It is possible to use a standalone environment for exporting an OGA model, refer to the official [ONNX Runtime GenAI Model Builder documentation](https://github.com/microsoft/onnxruntime-genai/tree/main/src/python/py/models). Once you have an exported OGA model, you can pass it directly to the `model_generate` command, which will skip the export step and perform only the post-processing. 
+::: + +Here are simple commands to export OGA model from quantized model using a standalone environment + + +```bash +conda create --name oga_builder_env python=3.10 +conda activate oga_builder_env + +pip install onnxruntime-genai==0.9.2 +pip install torch transformers onnx numpy + + +python3 -m onnxruntime_genai.models.builder -m -o -p int4 -e dml +``` diff --git a/docs/develop/operator-preparation.mdx b/docs/develop/operator-preparation.mdx new file mode 100644 index 00000000..9c9c1c43 --- /dev/null +++ b/docs/develop/operator-preparation.mdx @@ -0,0 +1,85 @@ +--- +title: Operator Preparation +sidebar_position: 7 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Compiling Operators for OGA Models + + + +Ryzen AI currently supports many popular LLMs in both hybrid and NPU-only flows. For these models, the required operators are already compiled and included in the Ryzen AI runtime. Such models can be run directly on Ryzen AI without any additional preparation. + +When users fine-tune these models, only the weights change and no new operator shapes are introduced. In that case, follow the steps from [ONNX Model Preparation](/develop/onnx-model-preparation) to prepare the model, which will run on the Ryzen AI runtime using the precompiled operators. + +However, in cases where architectural changes introduce new operator shapes not available in the Ryzen AI runtime, additional operator compilation is required. This page provides a recipe to compile operators that are not already present in the runtime. **This flow is experimental, and results may vary depending on the extent of the architectural changes**. + +:::info +All OGA models are currently based on the [ONNX Runtime GenAI Model Builder](https://github.com/microsoft/onnxruntime-genai/tree/main/src/python/py/models#current-support) architecture. Therefore, this operator compilation flow requires the models are supported by ONNX Runtime GenAI. 
+::: + +## Operator Compilation Flow (Hybrid Execution) + +Currently this flow is primarily supported for hybrid execution. + +1. Ensure the model is quantized following the [quantization recipe](/develop/onnx-model-preparation#quantization) + +2. Build the OGA DML model using the ONNX Runtime GenAI Model Builder included in the Ryzen AI software environment: + +```bash +conda activate ryzen-ai- +python -m onnxruntime_genai.models.builder \ + -i -o \ + -p int4 -e dml +``` + +3. Compile the operators extracted from the OGA DML model: + +```bash +onnx_utils vaiml --model-dir --plugin_name --compile --ops_type bfp16 +``` + +This generates a compiled operator package at: `transaction-plugin\.zip`. + +4. Generate the hybrid model: + +Create a folder named `dd_plugins` in the current working directory and place the plugin zip file inside it. By default, the flow looks for the operator zip in `dd_plugins`. To use a different location, see "Additional Details" below. + +Generate the hybrid model: + +```bash +model_generate --hybrid +``` + +5. Run the hybrid model + +Follow the [OGA Flow guide](/models-tutorials/llms/hybrid-inference#c-program) to copy `model_benchmark.exe` and required DLL dependencies to the current working directory. Then run: + +```bat +.\model_benchmark.exe -i -f amd_genai_prompt.txt -l "128, 256, 512, 1024, 2048" --verbose +``` + +## Additional Details + +### 1. Path to operator zip file + +If `.zip` is not placed in the `dd_plugins` folder, set the `DD_PLUGINS_ROOT` environment variable to point to its location: + +```bat +set DD_PLUGINS_ROOT=C:\path\to\folder\containing\plugin.zip +``` + +### 2. 
Enabling tracing + +To enable tracing for debug purposes, set the `DD_PLUGINS_TRACING` environment variable before generating the hybrid model: + +```bat +:: Optional: enable tracing +set DD_PLUGINS_TRACING=1 + +:: Generate the model +model_generate --hybrid +``` diff --git a/docs/develop/rocm-client-gpu.mdx b/docs/develop/rocm-client-gpu.mdx new file mode 100644 index 00000000..1d8240f4 --- /dev/null +++ b/docs/develop/rocm-client-gpu.mdx @@ -0,0 +1,51 @@ +--- +title: DirectML Flow (GPU) +sidebar_position: 9 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# DirectML Flow + + + +## Prerequisites + +- DirectX12 capable Windows OS (Windows 11 recommended) +- Latest AMD [GPU device driver](https://www.amd.com/en/support) installed +- [Microsoft Olive](https://microsoft.github.io/Olive/how-to/installation.html) for model conversion and optimization +- Latest [ONNX Runtime DirectML EP](https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html) + +You can ensure GPU driver and DirectX version from `Windows Task Manager` -> `Performance` -> `GPU` + +## Running models on Ryzen AI GPU + +Running models on the Ryzen AI GPU is accomplished in two simple steps: + +**Model Conversion and Optimization**: After the model is trained, Microsoft Olive Optimizer can be used to convert the model to ONNX and optimize it for optimal target execution. + +For additional information, refer to the [Microsoft Olive Documentation](https://microsoft.github.io/Olive/) + +**Deployment**: Once the model is in the ONNX format, the ONNX Runtime DirectML EP (`DmlExecutionProvider`) is used to run the model on the AMD Ryzen AI GPU. 
+ +For additional information, refer to the [ONNX Runtime documentation for the DirectML Execution Provider](https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html) + +## Examples + +- Optimizing and running [ResNet on Ryzen AI GPU](/models-tutorials/vision/igpu-getting-started) + +## Additional Resources + +- Article on how AMD and Black Magic Design worked together to accelerate [Davinci Resolve Studio](https://www.blackmagicdesign.com/products/davinciresolve/studio) workload on AMD hardware: + + - [AI Accelerated Video Editing with DaVinci Resolve 18.6 & AMD Radeon Graphics](https://www.amd.com/en/blogs/2023/ai-accelerated-video-editing-with-davinci-resolve-.html) + +- Blog posts on using the Ryzen AI Software for various generative AI workloads on GPU: + + - [Automatic1111 Stable Diffusion WebUI with DirectML Extension on AMD GPUs](https://www.amd.com/en/blogs/2023/-how-to-automatic1111-stable-diffusion-webui-with.html) + + - [Running Optimized Llama2 with Microsoft DirectML on AMD Radeon Graphics](https://www.amd.com/en/blogs/2023/-how-to-running-optimized-llama2-with-microsoft-d.html) + + - [AI-Assisted Mobile Workstation Workflows Powered by AMD Ryzen™ AI](https://www.amd.com/en/blogs/2024/ai-assisted-mobile-workstation-workflows-powered-b.html) diff --git a/docs/getting-started/index.mdx b/docs/getting-started/index.mdx new file mode 100644 index 00000000..597763f6 --- /dev/null +++ b/docs/getting-started/index.mdx @@ -0,0 +1,12 @@ +--- +title: 🚀 Getting Started +sidebar_position: 1 +--- + +import DocCardList from '@theme/DocCardList'; + +# 🚀 Getting Started + +Get up and running with AMD Ryzen AI Software. Install the SDK, run your first model, and learn the hardware and software architecture. 
+ + diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx new file mode 100644 index 00000000..bc4d6658 --- /dev/null +++ b/docs/getting-started/installation.mdx @@ -0,0 +1,220 @@ +--- +title: Installation +sidebar_position: 2 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import ExpectedOutput from '@site/src/components/ExpectedOutput'; + +# Installation + + + +This page covers Ryzen AI Software installation on Windows. For Linux LLM setup, see [Running LLM on Linux](/models-tutorials/llms/linux-setup). + +## Prerequisites + +The Ryzen AI Software supports AMD processors with a Neural Processing Unit (NPU). Refer to the [Supported Hardware](/getting-started/supported-hardware) page for the full list of supported configurations. + +The following must be installed before installing the Ryzen AI Software: + +| Dependency | Version Requirement | +|------------|---------------------| +| **Windows 11** | Build >= 22621.3527 | +| **Visual Studio 2022** | Optional — only required for AMD Quark custom op flows | +| **cmake** | Version >= 3.26 | +| **Python distribution** | [Miniforge](https://github.com/conda-forge/miniforge) (preferred) | + +:::warning +**IMPORTANT**: + +- **Visual Studio 2022 Community** (Optional for AMD Quark, to support custom op flow): ensure that `Desktop Development with C++` is installed. + +- **Miniforge**: ensure that the following path is set in the System PATH variable: `path\to\miniforge3\condabin` or `path\to\miniforge3\Scripts\` or `path\to\miniforge3\`. The System PATH variable should be set in the *System Variables* section of the *Environment Variables* window. 
+::: + +## Step 1: Install NPU Drivers + +Download and install the NPU driver (version 32.0.203.280 or newer): + +| Driver Version | Supported Platforms | Download | +|---------------|---------------------|----------| +| 32.0.203.280 | Phoenix, Hawk Point, Strix, Strix Halo, Krackan Point | [Download](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip) | +| 32.0.203.314 | Latest platforms | [Download](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.6.1_314_WHQL.zip) | + +**Installation steps:** + +1. Extract the downloaded ZIP file +2. Open a terminal in **administrator mode** +3. Run the installer: + +```powershell +.\npu_sw_installer.exe +``` + +**Verify the driver** by opening **Task Manager → Performance → NPU0**. You should see the NPU device listed. + +## Step 2: Install Ryzen AI Software + +Download the Ryzen AI Software bundled installer: [ryzenai-lt-1.7.0.exe](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=ryzen-ai-lt-1.7.0.exe) + +Launch the EXE installer and follow the installation wizard: + +1. Accept the terms of the License agreement +2. Provide the destination folder for Ryzen AI installation (default: `C:\Program Files\RyzenAI\1.7.0`) +3. Specify the name for the conda environment (default: `ryzen-ai-1.7.0`) + +The installer creates the conda environment and installs all Ryzen AI Software packages into it automatically. + +:::info +NuGet package is also available: [ryzen-ai-1.7.0-nuget.zip](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=signed_nuget_1.7.0.zip) +::: + +## Step 3: Test the Installation + +The Ryzen AI Software installation includes a `quicktest` to verify that everything is correctly installed. This test is expected to work for Strix (STX) or newer devices. + +1. Open a Conda command prompt (search for "Miniforge Prompt" in the Windows start menu) + +2. 
Activate the Conda environment created by the Ryzen AI installer: + +```bat +conda activate ryzen-ai-1.7.0 +``` + +3. Run the test: + +```bat +cd %RYZEN_AI_INSTALLATION_PATH%\quicktest +python quicktest.py +``` + + +{`INFO: [aiecompiler 77-749] Reading logical device aie2p_8x4_device +Using TXN FORMAT 0.1 +Test Passed`} + + +4. Verify NPU activity by opening **Task Manager → Performance → NPU** while the test is running. You should see NPU utilization increase during model inference. + +:::tip +To see detailed NPU offloading logs, run with verbose filtering: + +```bat +python quicktest.py 2>&1 | findstr /i "Operators Subgraphs VITIS_EP_CPU NPU Test" +``` + + +{`[Vitis AI EP] No. of Operators : + NPU 398 + VITIS_EP_CPU 2 +[Vitis AI EP] No. of Subgraphs : + NPU 1 +Test Passed`} + +::: + +:::info +- The full installation path is stored in the `RYZEN_AI_INSTALLATION_PATH` environment variable. +- For Phoenix/Hawk Point hardware, additional session options are required (`target` set to `X1`). See the [NPU Offloading with Session Options](#npu-offloading-with-session-options) section below. +::: + +## NPU Offloading with Session Options + +This section demonstrates how to enable NPU offloading logs using ONNX Runtime session options. The code also includes changes needed in `quicktest.py` to run on Phoenix/Hawk Point devices. 
+To view detailed logging information, update the session options in `quicktest.py` as shown below: + +```python +import os +import sys +import subprocess +import numpy as np +import onnxruntime as ort + +def get_npu_info(): + command = r'pnputil /enum-devices /bus PCI /deviceids ' + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + npu_type = '' + if 'PCI\\VEN_1022&DEV_1502&REV_00' in stdout.decode(): npu_type = 'PHX/HPT' + if 'PCI\\VEN_1022&DEV_17F0&REV_00' in stdout.decode(): npu_type = 'STX' + if 'PCI\\VEN_1022&DEV_17F0&REV_10' in stdout.decode(): npu_type = 'STX' + if 'PCI\\VEN_1022&DEV_17F0&REV_11' in stdout.decode(): npu_type = 'STX' + if 'PCI\\VEN_1022&DEV_17F0&REV_20' in stdout.decode(): npu_type = 'KRK' + return npu_type + +npu_type = get_npu_info() +install_dir = os.environ['RYZEN_AI_INSTALLATION_PATH'] +model = os.path.join(install_dir, 'quicktest', 'test_model.onnx') +providers = ['VitisAIExecutionProvider'] +provider_options = [{}] + +if npu_type == 'PHX/HPT': + print("Setting environment for PHX/HPT") + xclbin_file = os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'phoenix', '4x4.xclbin') + provider_options = [{ + 'target': 'X1', + 'xlnx_enable_py3_round': 0, + 'xclbin': xclbin_file, + }] + +session_options = ort.SessionOptions() +session_options.log_severity_level = 1 + +try: + session = ort.InferenceSession(model, + sess_options=session_options, + providers=providers, + provider_options=provider_options) +except Exception as e: + print(f"Failed to create an InferenceSession: {e}") + sys.exit(1) + +def preprocess_random_image(): + image_array = np.random.rand(3, 32, 32).astype(np.float32) + return np.expand_dims(image_array, axis=0) + +input_data = preprocess_random_image() +try: + outputs = session.run(None, {'input': input_data}) +except Exception as e: + print(f"Failed to run the InferenceSession: {e}") + sys.exit(1) +else: + print("Test finished") 
+``` + +Run the test with verbose filtering: + +```bat +cd %RYZEN_AI_INSTALLATION_PATH%\quicktest +python quicktest.py 2>&1 | findstr /i "Operators Subgraphs VITIS_EP_CPU NPU Test" +``` + + +{`[Vitis AI EP] No. of Operators : + NPU 398 + VITIS_EP_CPU 2 +[Vitis AI EP] No. of Subgraphs : + NPU 1 +Test finished`} + + +:::info +- For Phoenix/Hawk Point hardware, set the `target` to `X1` in the provider options. +::: + +## Alternative: Standalone LLM Install (pip) + +If you only need to run LLMs and prefer not to use the bundled installer, you can set up a standalone environment. See [Running LLM via pip install](/models-tutorials/llms/hybrid-inference#running-llm-via-pip-install) for instructions. + +For the high-level Python SDK (Lemonade), see [High-Level Python SDK](/models-tutorials/llms/python-api) which provides a quick PyPI-based setup. + +## Next Steps + +- [Quickstart](/getting-started/quickstart) -- Run your first LLM on the NPU +- [Supported Hardware](/getting-started/supported-hardware) -- Full hardware compatibility matrix diff --git a/docs/getting-started/overview.mdx b/docs/getting-started/overview.mdx new file mode 100644 index 00000000..78dbd116 --- /dev/null +++ b/docs/getting-started/overview.mdx @@ -0,0 +1,45 @@ +--- +title: Overview & Architecture +sidebar_position: 2 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; +import FeatureState from '@site/src/components/FeatureState'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Ryzen AI Software + + + +AMD Ryzen™ AI Software includes the tools and runtime libraries for optimizing and deploying AI inference on AMD Ryzen™ AI powered PCs. Ryzen AI software enables applications to run on the neural processing unit (NPU) built in the AMD XDNA™ architecture, as well as on the integrated GPU and discrete GPU. 
Developers can build and deploy models trained in PyTorch or TensorFlow and run them directly on laptops powered by Ryzen AI. + +## Development Flow + +The Ryzen AI development flow does not require modifications to existing model training processes. A pre-trained model is the starting point: + +1. **Quantize** — Convert model parameters to lower precision (INT8, INT4) using [AMD Quark](/develop/model-quantization) for better performance and lower power consumption. Float32 models are also supported and internally converted to bfloat16. +2. **Compile & Deploy** — Deploy the quantized model using [ONNX Runtime](/develop/model-deployment) with the Vitis AI Execution Provider, which automatically partitions operations between the NPU and CPU. +3. **Build Applications** — Use the [Python SDK](/models-tutorials/llms/python-api), [Server Interface](/models-tutorials/llms/server-interface), or [C++ API](/models-tutorials/llms/hybrid-inference) to integrate AI into your application. + +## Supported Workloads + +| Domain | Examples | Devices | Documentation | +|--------|----------|---------|---------------| +| Large Language Models | [Llama 3.x](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [Phi-3/4](https://huggingface.co/microsoft), [Qwen 2.5](https://huggingface.co/Qwen), [DeepSeek](https://huggingface.co/deepseek-ai) | NPU, GPU | [LLM Tutorials](/models-tutorials/llms/overview) | +| Vision | [ResNet](https://huggingface.co/microsoft/resnet-50), [YOLOv8](https://huggingface.co/Ultralytics/YOLOv8), [Stable Diffusion](https://huggingface.co/stabilityai), [MobileNet](https://huggingface.co/google/mobilenet_v2_1.0_224) | NPU, GPU | [Vision Tutorials](/models-tutorials/vision/cnn-examples) | +| Audio | [Whisper](https://huggingface.co/openai/whisper-large-v3) (speech-to-text) | NPU | [Whisper Tutorial](/models-tutorials/audio/whisper) | +| Multimodal | [Gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) (VLM) | NPU | [Multimodal 
Models](/models-tutorials/multimodal) | + +## LLM Interfaces + +The Ryzen AI LLM stack provides three development interfaces, each suited for different use cases: + +| Interface | Use Case | Language | Details | +|-----------|----------|----------|---------| +| [Python SDK (Lemonade API)](/models-tutorials/llms/python-api) | Rapid prototyping, scripting | Python | High-level API built on OGA | +| [Server Interface (REST)](/models-tutorials/llms/server-interface) | Integration with existing apps, OpenAI-compatible | HTTP/REST | OpenAI-compatible endpoint | +| [OGA / C++ API](/models-tutorials/llms/hybrid-inference) | Production native apps, hybrid NPU+GPU | C++ | Low-level control, hybrid inference | + diff --git a/docs/getting-started/quickstart.mdx b/docs/getting-started/quickstart.mdx new file mode 100644 index 00000000..6427651b --- /dev/null +++ b/docs/getting-started/quickstart.mdx @@ -0,0 +1,79 @@ +--- +title: Quickstart +sidebar_position: 3 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import ExpectedOutput from '@site/src/components/ExpectedOutput'; +import TutorialDifficulty from '@site/src/components/TutorialDifficulty'; +import CodeBlock from '@theme/CodeBlock'; +import QuickstartLLMSource from '!!raw-loader!@site/code-samples/getting-started/quickstart_llm.py'; + +# Quickstart + + + +Run your first LLM on the AMD NPU in under 5 minutes. 
+ +## Prerequisites + +- [Installation](/getting-started/installation) completed +- Ryzen AI conda environment activated (default: `ryzen-ai-1.7.0`) + +## Install Dependencies + + + + +```powershell +pip install onnxruntime-genai huggingface_hub +``` + + + + +```bash +pip install onnxruntime-genai huggingface_hub +``` + + + + +## Download a Model + + + + +```powershell +huggingface-cli download amd/Llama-3.2-1B-Instruct-onnx-ryzenai-hybrid --local-dir models/Llama-3.2-1B-Instruct-onnx-ryzenai-hybrid +``` + + + + +```bash +huggingface-cli download amd/Llama-3.2-1B-Instruct-onnx-ryzenai-hybrid --local-dir models/Llama-3.2-1B-Instruct-onnx-ryzenai-hybrid +``` + + + + +## Run Inference + +{QuickstartLLMSource} + +The model will stream a response to the terminal. Output will vary depending on the model. + +:::tip +The first run downloads and compiles the model, which takes longer. Subsequent runs start much faster. +::: + +## Next Steps + +- **[Chat Application](/models-tutorials/llms/server-interface)** -- Build a full chat interface +- **[Supported Models](/models-tutorials/llms/supported-models)** -- Browse all verified LLMs +- **[Applications](/applications)** -- Pre-built apps like Lemonade and GAIA diff --git a/docs/getting-started/supported-hardware.mdx b/docs/getting-started/supported-hardware.mdx new file mode 100644 index 00000000..3a021891 --- /dev/null +++ b/docs/getting-started/supported-hardware.mdx @@ -0,0 +1,37 @@ +--- +title: Supported Hardware +sidebar_position: 4 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Supported Hardware + + + +## NPU-Enabled Processors + +Ryzen AI 1.7 Software supports AMD processors codenamed Phoenix, Hawk Point, Strix, Strix Halo, and Krackan Point. 
These processors can be found in the following Ryzen series: + +- Ryzen AI 300 Series, Ryzen AI PRO Series, Ryzen AI Max 300 Series (Strix, Strix Halo, Krackan Point — XDNA 2) +- Ryzen 8000 Series, Ryzen PRO 8000 Series (Hawk Point — XDNA) +- Ryzen 7000 Series, Ryzen PRO 7000 Series (Phoenix — XDNA) +- Ryzen 200 Series + +For a complete list, refer to the [AMD processor specifications](https://www.amd.com/en/products/specifications/processors.html) page (look for the "AMD Ryzen AI" column and select "Available"). + +## GPU Support + +Models can be run on the integrated AMD GPU using [DirectML](/develop/rocm-client-gpu). This uses the ONNX Runtime DirectML Execution Provider. + +## Operating Systems + +| OS | NPU | GPU (DirectML) | +|----|-----|----------------| +| Windows 11 23H2+ | Yes | Yes | + +:::info +Linux NPU support for LLMs is available. See [Running LLM on Linux](/models-tutorials/llms/linux-setup). +::: diff --git a/docs/models-tutorials/audio/index.mdx b/docs/models-tutorials/audio/index.mdx new file mode 100644 index 00000000..ca51c708 --- /dev/null +++ b/docs/models-tutorials/audio/index.mdx @@ -0,0 +1,26 @@ +--- +title: Audio +sidebar_position: 1 +--- + +# Audio + +Deploy audio AI models on AMD Ryzen AI hardware. + +[Whisper](https://github.com/openai/whisper) provides high-quality speech-to-text on AMD Ryzen AI NPUs. AMD offers NPU-optimized ONNX Whisper models from the [Ryzen AI 1.7 Whisper NPU Optimized ONNX Models](https://huggingface.co/collections/amd/ryzen-ai-17-whisper-npu-optimized-onnx-models) collection on Hugging Face. 
+ +## Supported Models + +| Model | Device | Verified | +|-------|--------|----------| +| [amd/whisper-tiny-onnx-npu](https://huggingface.co/amd/whisper-tiny-onnx-npu) | NPU | Pending | +| [amd/whisper-base-onnx-npu](https://huggingface.co/amd/whisper-base-onnx-npu) | NPU | Pending | +| [amd/whisper-small-onnx-npu](https://huggingface.co/amd/whisper-small-onnx-npu) | NPU | Pending | +| [amd/whisper-small-en-onnx-npu](https://huggingface.co/amd/whisper-small-en-onnx-npu) | NPU | Pending | +| [amd/whisper-medium-onnx-npu](https://huggingface.co/amd/whisper-medium-onnx-npu) | NPU | Pending | +| [amd/whisper-large-v3-onnx-npu](https://huggingface.co/amd/whisper-large-v3-onnx-npu) | NPU | Pending | +| [amd/whisper-large-turbo-onnx-npu](https://huggingface.co/amd/whisper-large-turbo-onnx-npu) | NPU | Pending | + +## Tutorials + +- [Whisper Tutorial](/models-tutorials/audio/whisper) — Step-by-step guide to run Whisper speech-to-text on the NPU diff --git a/docs/models-tutorials/audio/supported-models.mdx b/docs/models-tutorials/audio/supported-models.mdx new file mode 100644 index 00000000..b5878643 --- /dev/null +++ b/docs/models-tutorials/audio/supported-models.mdx @@ -0,0 +1,26 @@ +--- +title: "Audio: Supported Models" +sidebar_position: 1 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Audio: Supported Models + + + +[Whisper](https://github.com/openai/whisper) provides high-quality speech-to-text on AMD Ryzen AI NPUs. AMD offers NPU-optimized ONNX Whisper models from the [Ryzen AI 1.7 Whisper NPU Optimized](https://huggingface.co/collections/amd/ryzen-ai-17-whisper-npu-optimized-onnx-models) collection on Hugging Face. + +For a step-by-step guide, see the [Whisper tutorial](/models-tutorials/audio/whisper). 
+ +| Model | Device | Verified | +|-------|--------|----------| +| [amd/whisper-tiny-onnx-npu](https://huggingface.co/amd/whisper-tiny-onnx-npu) | NPU | Pending | +| [amd/whisper-base-onnx-npu](https://huggingface.co/amd/whisper-base-onnx-npu) | NPU | Pending | +| [amd/whisper-small-onnx-npu](https://huggingface.co/amd/whisper-small-onnx-npu) | NPU | Pending | +| [amd/whisper-small-en-onnx-npu](https://huggingface.co/amd/whisper-small-en-onnx-npu) | NPU | Pending | +| [amd/whisper-medium-onnx-npu](https://huggingface.co/amd/whisper-medium-onnx-npu) | NPU | Pending | +| [amd/whisper-large-v3-onnx-npu](https://huggingface.co/amd/whisper-large-v3-onnx-npu) | NPU | Pending | +| [amd/whisper-large-turbo-onnx-npu](https://huggingface.co/amd/whisper-large-turbo-onnx-npu) | NPU | Pending | diff --git a/docs/models-tutorials/audio/whisper/index.mdx b/docs/models-tutorials/audio/whisper/index.mdx new file mode 100644 index 00000000..5763bcf0 --- /dev/null +++ b/docs/models-tutorials/audio/whisper/index.mdx @@ -0,0 +1,173 @@ +--- +title: "Running Whisper on Ryzen AI" +ci_validated: false +--- + +{/* AUTO-GENERATED from audio/whisper/README.md -- do not edit directly. */} +{/* Run "node website/scripts/sync-examples.mjs" after updating the source. */} + +import CIStatus from '@site/src/components/CIStatus'; + + + +:::info Source Code +Clone the repo and find this example at [`examples/audio/whisper/`](https://github.com/amd/RyzenAI-SW/tree/main/examples/audio/whisper). +::: + +# Automatic Speech Recognition with OpenAI Whisper + + +Run fast, on-device speech recognition with Ryzen AI and OpenAI Whisper. This guide covers running pre-built NPU-optimized models (quick start) and exporting your own models from Hugging Face (advanced). 
+ +## Supported Models + +| Model | Parameters | NPU Support | +|-------|-----------|-------------| +| [amd/whisper-base-onnx-npu](https://huggingface.co/amd/whisper-base-onnx-npu) | 74M | Yes | +| [amd/whisper-small-onnx-npu](https://huggingface.co/amd/whisper-small-onnx-npu) | 244M | Yes | +| [amd/whisper-medium-onnx-npu](https://huggingface.co/amd/whisper-medium-onnx-npu) | 769M | Yes | +| [amd/whisper-large-v3-turbo-onnx-npu](https://huggingface.co/amd/whisper-large-v3-turbo-onnx-npu) | 809M | Yes | + +## Prerequisites + +1. **Install Ryzen AI Software** — follow the [Installation guide](/getting-started/installation). + +2. **Activate environment** + +```bash +conda activate ryzen-ai-1.4.0 +``` + +3. **Install dependencies** + +```bash +cd docs/models-tutorials/audio/whisper +pip install -r requirements.txt +``` + +## Quick Start: Transcribe an Audio File + +Models are auto-downloaded from AMD's Hugging Face repos on first run. + +```bash +python run_whisper.py \ + --model-type whisper-base \ + --device npu \ + --input audio_files/1089-134686-0000.wav +``` + +Replace `whisper-base` with any supported model (`whisper-small`, `whisper-medium`, `whisper-large-v3-turbo`). + +## Live Microphone Transcription + +```bash +python run_whisper.py \ + --model-type whisper-base \ + --device npu \ + --input mic \ + --duration 0 +``` + +`--duration 0` records continuously until Ctrl+C or silence is detected. + +## Dataset Evaluation (WER, CER, RTF) + +Evaluate on LibriSpeech samples to measure Word Error Rate, Character Error Rate, Real-Time Factor, and Time to First Token: + +```bash +python run_whisper.py \ + --model-type whisper-base \ + --device npu \ + --eval-dir eval_dataset/LibriSpeech-samples \ + --results-dir results +``` + +## NPU Configuration + +### How NPU Acceleration Works + +When running on the NPU, Whisper's encoder and decoder are accelerated through the Vitis AI Execution Provider. 
For whisper-base: + +```text notest +# Encoder operations +[Vitis AI EP] No. of Operators : VAIML 225 +[Vitis AI EP] No. of Subgraphs : VAIML 1 + +# Decoder operations +[Vitis AI EP] No. of Operators : CPU 24 VAIML 341 +[Vitis AI EP] No. of Subgraphs : VAIML 2 +``` + +100% of encoder operators and 93.4% of decoder operators run on the NPU. + +### Execution Provider Configuration + +Edit `config/model_config.json` to configure execution providers per model. For NPU, set `cache_key`, `cache_dir`, and point to the appropriate VitisAI config: + +```json +{ + "config_file": "config/vitisai_config_whisper_decoder.json", + "cache_dir": "./cache", + "cache_key": "whisper_medium_decoder" +} +``` + +### Whisper-Medium Special Configuration + +Whisper-medium requires additional flags in `config/vitisai_config_whisper_encoder.json`: + +```json +"vaiml_config": { + "optimize_level": 3, + "aiecompiler_args": "--system-stack-size=512" +} +``` + +- `optimize_level=3`: aggressive optimizations for larger models +- `--system-stack-size=512`: increases AI Engine stack size for whisper-medium's resource demands + +## Advanced: Export Your Own Models + +If you need to export a custom Whisper model (e.g., a fine-tuned variant) from Hugging Face to ONNX with static shapes for NPU: + +### Step 1: Export to ONNX + +```bash +optimum-cli export onnx \ + --model openai/whisper-base \ + --task automatic-speech-recognition \ + whisper-base-onnx/ +``` + +### Step 2: Convert Dynamic to Static Shapes + +The NPU requires static input shapes. Use the included conversion script: + +```bash +python dynamic_to_static.py +``` + +This uses `params.json` to fix dynamic dimensions in the encoder and decoder ONNX models. 
+ +### Step 3: Run with Explicit Paths + +```bash +python run_whisper.py \ + --encoder whisper-base-onnx/encoder_model.onnx \ + --decoder whisper-base-onnx/decoder_model.onnx \ + --device npu \ + --input audio_files/1089-134686-0000.wav +``` + +## Whisper.cpp (C++ Alternative) + +Ryzen AI also provides NPU acceleration for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) through an AMD-maintained fork. On Ryzen AI 300 Series, the encoder fully offloads to the NPU for significant speedup versus CPU-only runs. NPU acceleration is currently Windows-only with Linux support planned. + +For setup steps and NPU-optimized model guidance, see the [AMD whisper.cpp fork](https://github.com/amd/whisper.cpp?tab=readme-ov-file#amd-ryzen-ai-support-for-npu). + +## Notes + +- First run on NPU takes ~15 minutes for model compilation. Subsequent runs use the cached compiled model. +- Supports both CPU and NPU devices via the `--device` flag. +- Use `--language` to force a specific language for transcription. + diff --git a/docs/models-tutorials/examples.mdx b/docs/models-tutorials/examples.mdx new file mode 100644 index 00000000..0b4c20ba --- /dev/null +++ b/docs/models-tutorials/examples.mdx @@ -0,0 +1,74 @@ +--- +title: Examples & Demos +sidebar_position: 1 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Examples and Demos + + + +All examples are available in the [RyzenAI-SW GitHub repository](https://github.com/amd/RyzenAI-SW). 
+ +## Vision Examples + +| Example | Description | Device | Language | Link | +|---------|-------------|--------|----------|------| +| Hello World | Simple ONNX model on NPU | NPU | Python | [Code](/models-tutorials/vision/hello-world) | +| ResNet CIFAR-10 (INT8) | Image classification with INT8 quantization | NPU | Python, C++ | [Code](/models-tutorials/vision/getting-started-resnet/int8) | +| ResNet CIFAR-10 (BF16) | Image classification with BF16 | NPU | Python, C++ | [Code](/models-tutorials/vision/getting-started-resnet/bf16) | +| ResNet-50 ImageNet | BF16 image classification on ImageNet | NPU | Python | [Code](/models-tutorials/vision/image-classification) | +| YOLOv8m Object Detection | BF16 and XINT8 object detection | NPU | Python | [Code](/models-tutorials/vision/object-detection/yolov8m) | +| YOLOv8s-WorldV2 | Open-vocabulary object detection | NPU | Python | [Code](/models-tutorials/vision/object-detection/yolov8s-worldv2) | +| Torchvision Models | Run torchvision models on NPU | NPU | Python, Jupyter | [Code](/models-tutorials/vision/torchvision-inference) | +| Quark Quantization | Quantize models with AMD Quark | NPU | Python | [Code](/models-tutorials/vision/quark-quantization) | +| ResNet-50 on iGPU | Image classification on integrated GPU | iGPU | Python, C++ | [Code](/models-tutorials/vision/igpu-getting-started) | +| Super Resolution | Real-ESRGAN and SESR-M7 | NPU | - | [Tutorial](/models-tutorials/vision/super-resolution) | +| Stable Diffusion | SD 1.5 to 3.5 on iGPU | iGPU | Python | [Tutorial](/models-tutorials/vision/stable-diffusion) | + +## LLM Examples + +| Example | Description | Device | Language | Link | +|---------|-------------|--------|----------|------| +| OGA Python Inference | Chat with LLMs using OGA Python API | NPU, Hybrid | Python | [Code](/models-tutorials/llms/oga-inference) | +| OGA C++ API | Native C++ LLM inference | NPU, Hybrid | C++ | [Code](/models-tutorials/llms/oga-api) | +| LLM Fine-Tuning & Deploy | Fine-tune and 
deploy LLMs on NPU | NPU | Python | [Code](/models-tutorials/llms/llm-sft-deploy) | +| Vision-Language Model | Run VLMs with OGA | NPU | Python | [Code](/models-tutorials/llms/vlm) | +| RAG with OGA | Retrieval-augmented generation | NPU | Python | [Code](/models-tutorials/llms/rag-oga) | + +## Audio Examples + +| Example | Description | Device | Language | Link | +|---------|-------------|--------|----------|------| +| Whisper ASR | Speech-to-text with Whisper | NPU | Python | [Code](/models-tutorials/audio/whisper) | + +## NLP Examples + +| Example | Description | Device | Language | Link | +|---------|-------------|--------|----------|------| +| DistilBERT Text Classification | BF16 text classification | NPU | Python | [Code](/models-tutorials/nlp/distilbert) | + +## End-to-End Demos + +| Demo | Description | Devices | Link | +|------|-------------|---------|------| +| NPU-GPU Pipeline | YOLOv8 + RCAN on NPU, Stable Diffusion on iGPU | NPU + iGPU | [Code](/models-tutorials/multimodal/npu-gpu-pipeline) | +| Whisper ASR Demo | Real-time speech-to-text | NPU | [Code](/models-tutorials/audio/whisper) | + +## Tools + +| Tool | Description | Link | +|------|-------------|------| +| ONNX Benchmark | Benchmark ONNX model inference (FPS, latency, power) | [Code](/tools/benchmarking) | +| NPU Check Utility | C++ utility to check NPU compatibility | [Code](/tools/npu-check) | + +## CVML Library Samples + +| Sample | Description | Link | +|--------|-------------|------| +| Face Detection | Real-time face detection | [Code](/models-tutorials/vision/cvml/face-detection) | +| Face Mesh | Face landmark detection | [Code](/models-tutorials/vision/cvml/face-mesh) | +| Depth Estimation | Monocular depth estimation | [Code](/models-tutorials/vision/cvml) | diff --git a/docs/models-tutorials/index.mdx b/docs/models-tutorials/index.mdx new file mode 100644 index 00000000..84ce52e8 --- /dev/null +++ b/docs/models-tutorials/index.mdx @@ -0,0 +1,12 @@ +--- +title: 🧠 Models & Tutorials 
+sidebar_position: 1 +--- + +import DocCardList from '@theme/DocCardList'; + +# 🧠 Models & Tutorials + +Browse verified models and step-by-step tutorials organized by AI domain. All examples are available in the [RyzenAI-SW GitHub repository](https://github.com/amd/RyzenAI-SW). + + diff --git a/docs/models-tutorials/llms/hybrid-inference.mdx b/docs/models-tutorials/llms/hybrid-inference.mdx new file mode 100644 index 00000000..3a200a44 --- /dev/null +++ b/docs/models-tutorials/llms/hybrid-inference.mdx @@ -0,0 +1,284 @@ +--- +title: OnnxRuntime GenAI (OGA) Flow +sidebar_position: 4 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# OnnxRuntime GenAI (OGA) Flow + + + +Ryzen AI Software supports deploying LLMs on Ryzen AI PCs using the native ONNX Runtime Generate (OGA) C++ or Python API. The OGA API is the lowest-level API available for building LLM applications on a Ryzen AI PC. It supports the following execution modes: + +- **Hybrid execution mode**: This mode uses both the NPU and iGPU to achieve the best TTFT and TPS during the prefill and decode phases. +- **NPU-only execution mode**: This mode uses the NPU exclusively for both the prefill and decode phases. + +## Supported Configurations + +The Ryzen AI OGA flow supports Strix and Krackan Point processors. Phoenix (PHX) and Hawk (HPT) processors are not supported. + +## Requirements + +- Install NPU Drivers and Ryzen AI MSI installer. See [Installation](/getting-started/installation) for more details. +- Install GPU device driver: Ensure GPU device driver https://www.amd.com/en/support is installed +- Install Git for Windows (needed to download models from HF): https://git-scm.com/downloads + +## Pre-optimized Models + +AMD provides a set of pre-optimized LLMs ready to be deployed with Ryzen AI Software and the supporting runtime for hybrid and/or NPU-only execution. 
These include popular architectures such as Llama-2, Llama-3, Mistral, DeepSeek Distill models, Qwen-2, Qwen-2.5, Qwen-3, Gemma-2, Gemma-3, GPT-OSS, Phi-3, Phi-3.5, and Phi-4. For the detailed list of supported models, visit [Supported Models](/models-tutorials/llms/supported-models) + +- Hugging Face collection of hybrid models: https://huggingface.co/collections/amd/ryzen-ai-17-hybrid-llm +- Hugging Face collection of NPU models: https://huggingface.co/collections/amd/ryzen-ai-17-npu-llm + +Each OGA model folder contains a `genai_config.json` file. This file contains various configuration settings for the model. The `session_option` section is where information about specific runtime dependencies is specified. + +## Changes Compared to Previous Release + +- OGA version is updated to v0.11.2 (Ryzen AI 1.7) from v0.9.2.2 (Ryzen AI 1.6.1). +- For 1.7 release, a new set of hybrid and NPU models is published. Models from earlier releases are not compatible with this version. If you are using Ryzen AI 1.7, please download the updated models. +- Context length upto 4K tokens (combined input and output) is supported for NPU Models. Extended context length more than 4K tokens is supported for Hybrid models. + +## Compatible OGA APIs + +Pre-optimized hybrid or NPU LLMs can be executed using the official OGA C++ and Python APIs. The current release is compatible with OGA version 0.11.2. +For detailed documentation and examples, refer to the official OGA repository: +https://github.com/microsoft/onnxruntime-genai/tree/rel-0.11.2 + +## LLMs Test Programs + +The Ryzen AI installation includes test programs (in C++ and Python) that can be used to run LLMs and understand how to integrate them in your application. + +The steps for deploying the pre-optimized models using the sample programs are described in the following sections. + +### Steps to run C++ program and sample python script + +#### 1. 
(Optional) Enable Performance Mode + +To run LLMs in best performance mode, follow these steps: + +- Go to `Windows` → `Settings` → `System` → `Power`, and set the power mode to **Best Performance**. +- Open a terminal and run: + +```bat +cd C:\Windows\System32\AMD +xrt-smi configure --pmode performance +``` + +#### 2. Activate the Ryzen AI Conda Environment and install `torch` library + +Run the following commands: + +```bat +conda activate ryzen-ai- +pip install torch==2.7.1 +``` + +This step is required for running the Python script. + +:::info +For the C++ program, if you choose not to activate the Conda environment, open a Windows Command Prompt and manually set the environment variable before continuing: + +`set RYZEN_AI_INSTALLATION_PATH=C:\Program Files\RyzenAI\` +::: + +### C++ Program + +Use the `model_benchmark.exe` executable to test LLMs and identify DLL dependencies for C++ applications. + +#### 1. Set Up a working directory and copy required Files + +```bat +mkdir llm_run +cd llm_run + +:: Copy the sample C++ executable +xcopy /Y "%RYZEN_AI_INSTALLATION_PATH%\LLM\example\model_benchmark.exe" . + +:: Copy the sample prompt file +xcopy /Y "%RYZEN_AI_INSTALLATION_PATH%\LLM\example\amd_genai_prompt.txt" . + +:: Copy required DLLs +xcopy /Y "%RYZEN_AI_INSTALLATION_PATH%\deployment\." . +``` + +#### 2. Download model from Hugging Face + +```bat +:: Install Git LFS if you haven't already: https://git-lfs.com +git lfs install + +:: Clone the model repository +git clone https://huggingface.co/amd/Llama-2-7b-chat-hf-onnx-ryzenai-hybrid +``` + +#### 3. Run `model_benchmark.exe` + +```bat +.\model_benchmark.exe -i -f -l + +:: Example: +.\model_benchmark.exe -i Llama-2-7b-chat-hf-onnx-ryzenai-hybrid -f amd_genai_prompt.txt -l "1024" +``` + +### Long Context Support + +Ryzen AI now supports token counts up to the model's context length for **hybrid models**. If the total number of tokens exceed 4096, follow the below steps. + +**Steps to run long context:** + +1. 
Make the following changes in `genai_config.json` file. + +- Add `"hybrid_opt_chunk_context": "1"` under `model.decoder.session_options.provider_options.RyzenAI`. + +```json +{ + "model": { + "bos_token_id": 1, + "context_length": 16384, + "decoder": { + "session_options": { + "log_id": "onnxruntime-genai", + "provider_options": [ + { + "RyzenAI": { + "external_data_file": "model_jit.pb.bin", + "hybrid_opt_free_after_prefill": "1", + "hybrid_opt_max_seq_length": "4096", + "hybrid_opt_chunk_context": "1" + } + } + ] + } + } + } +} +``` + +- Add `"chunk_size":2048` under `search`. + +```json +"search": { + "diversity_penalty": 0.0, + "do_sample": false, + "chunk_size": 2048, + ... +} +``` + +2. Copy the `amd_genai_prompt_long.txt` into your working directory. + +```bat +xcopy /Y "%RYZEN_AI_INSTALLATION_PATH%\LLM\example\amd_genai_prompt_long.txt" . +``` + +3. Run the model using `model_benchmark.exe` using the `amd_genai_prompt_long.txt` prompt file. + +```bat +.\model_benchmark.exe -i -f amd_genai_prompt_long.txt -l "16000" +``` + +:::info +The sample test application model_benchmark.exe accepts -l for input token length and -g for output token length. In Ryzen AI 1.7, **NPU models** support up to 4096 tokens in total (input + output). By default, -g is set to 128. If the input length is close to 4096, you must adjust -g so the sum of input and output tokens does not exceed 4096. For example, -l 4000 -g 96 is valid (4000 + 96 ≤ 4096), while -l 4000 -g 128 will exceed the limit and result in an error. + +For **Hybrid models**, the combined number of input and output tokens must not exceed the model's `context_length`. You can verify the `context_length` in the `genai_config.json` file. For example, if a model's `context_length` is 8,000, the total token count (input + output) must not exceed 8,000. + +The long context feature has been tested for hybrid models upto 16,000 tokens. +::: + +### Python Script + +#### 1. 
Navigate to your working directory and download model + +```bat +:: Install Git LFS if you haven't already: https://git-lfs.com +git lfs install + +:: Clone the model repository +git clone https://huggingface.co/amd/Llama-2-7b-chat-hf-onnx-ryzenai-hybrid +``` + +#### 2. Run sample python script + +```bat +python "%RYZEN_AI_INSTALLATION_PATH%\LLM\example\run_model.py" -m -l + +:: Example command +python "%RYZEN_AI_INSTALLATION_PATH%\LLM\example\run_model.py" -m "Llama-2-7b-chat-hf-onnx-ryzenai-hybrid" -l 256 +``` + +:::info +Some models may return non-printable characters in their output (for example, Qwen models), which can cause a crash while printing the output text. To avoid this, modify the provided script %RYZEN_AI_INSTALLATION_PATH%\LLM\example\run_model.py by adding a text sanitization function and updating the print statement as shown below. + +Add sanitize_string function: + +``` +def sanitize_string(input_string): + return input_string.encode("charmap", "ignore").decode("charmap") +``` + +Update line 80 to print sanitized output: + +``` +print("Output:", sanitize_string(output_text)) +``` + +This sanitization fix will be included in the run_model.py script in the next release. +::: + +### Python Script (with Chat Template) + +For models that use chat templates, the sample [model_chat.py](/models-tutorials/llms/oga-inference) script may provide better output quality. The script and usage instructions are available in the [RyzenAI-SW repository](/models-tutorials/llms/oga-inference). + +This script automatically loads and applies the chat template from the model folder during inference, which can improve output quality for models that use a chat template. + +It is highly recommended to use [model_chat.py](/models-tutorials/llms/oga-inference) for the [GPT-OSS-20B NPU model](https://huggingface.co/amd/gpt-oss-20b-onnx-ryzenai-npu). 
+ +## Vision Language Model (VLM) + +AMD provides a pre-optimized Gemma-3-4b-it multimodal model ready to be deployed with Ryzen AI Software. Support for this model is available starting with the current Ryzen AI 1.7 release. + +Model: [Gemma-3-4b-it-mm-onnx-ryzenai-npu](https://huggingface.co/amd/Gemma-3-4b-it-mm-onnx-ryzenai-npu) + +VLM inference requires a dedicated Python script. The python script and usage instructions are available in the RyzenAI-SW repository: [VLM](/models-tutorials/llms/vlm) + +## Building C++ Applications + +A complete example including C++ source and build instructions is available in the RyzenAI-SW repository: [LLM-examples/oga_api](/models-tutorials/llms/oga-api) + +## Using Fine-Tuned Models + +It is also possible to run fine-tuned versions of the pre-optimized OGA models. + +To do this, the fine-tuned models must first be prepared for execution with the OGA flow. For instructions on how to do this, refer to the page about [ONNX Model Preparation](/develop/onnx-model-preparation). + +After a fine-tuned model has been prepared for execution, it can be deployed by following the steps described previously in this page. + +## Running LLM via pip install + +In addition to the full RyzenAI software stack, we also provide standalone wheel files for the users who prefer using their own environment. To prepare an environment for running the Hybrid and NPU-only LLM independently, perform the following steps: + +1. Create a new python environment and activate it. + +```bash +conda create -n python=3.12 -y +conda activate +``` + +2. Install onnxruntime-genai wheel file. + +```bash +pip install onnxruntime-genai-directml-ryzenai==0.11.2 --extra-index-url=https://pypi.amd.com/simple +pip install model-generate==1.7.0 --extra-index-url=https://pypi.amd.com/simple +``` + +3. Navigate to your working directory and download the desired Hybrid/NPU model + +```bash +cd working_directory +git clone +``` + +4. Run the Hybrid or NPU model. 
diff --git a/docs/models-tutorials/llms/index.mdx b/docs/models-tutorials/llms/index.mdx new file mode 100644 index 00000000..b0c3ba68 --- /dev/null +++ b/docs/models-tutorials/llms/index.mdx @@ -0,0 +1,20 @@ +--- +title: Large Language Models +sidebar_position: 2 +--- + +# Large Language Models + +Deploy and run Large Language Models on AMD Ryzen AI processors. + +- [LLM Deployment Overview](/models-tutorials/llms/overview) - Architecture and deployment options +- [High-level Python SDK](/models-tutorials/llms/python-api) - Python API for LLM inference +- [Server Interface](/models-tutorials/llms/server-interface) - OpenAI-compatible server +- [Hybrid Inference](/models-tutorials/llms/hybrid-inference) - NPU + GPU hybrid execution +- [Linux Setup](/models-tutorials/llms/linux-setup) - Linux-specific configuration +- [Supported LLMs](/models-tutorials/llms/supported-models) - Full list of supported models +- [Vision-Language Models](/models-tutorials/llms/vlm) - VLM deployment +- [OGA C++ API](/models-tutorials/llms/oga-api) - Low-level C++ API for ONNX GenAI +- [OGA Inference (Python)](/models-tutorials/llms/oga-inference) - Python OGA inference +- [Fine-tune and Deploy](/models-tutorials/llms/llm-sft-deploy) - SFT fine-tuning and deployment +- [RAG with OGA](/models-tutorials/llms/rag-oga) - Retrieval-Augmented Generation diff --git a/docs/models-tutorials/llms/linux-setup.mdx b/docs/models-tutorials/llms/linux-setup.mdx new file mode 100644 index 00000000..326068a6 --- /dev/null +++ b/docs/models-tutorials/llms/linux-setup.mdx @@ -0,0 +1,161 @@ +--- +title: Running LLM on Linux +sidebar_position: 5 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Running LLM on Linux + + + +This page showcases an example of running LLM on RyzenAI NPU + +- Open a Linux terminal and create a new folder + +```bash +mkdir run_llm +cd run_llm +``` + +- Choose any prequantized and postprocessed ready-to-run Model from 
[Hugging Face collection of NPU models](https://huggingface.co/collections/amd/ryzen-ai-17-npu-llm) +- For this flow, "Phi-3.5-mini-instruct-onnx-ryzenai-npu" is chosen for reference + +```bash +# Make sure git-lfs is installed (https://git-lfs.com) +git lfs install +git clone https://huggingface.co/amd/Phi-3.5-mini-instruct-onnx-ryzenai-npu +``` + +- Search for RYZEN_AI_INSTALLATION_PATH + +```bash +echo $RYZEN_AI_INSTALLATION_PATH +/ryzen_ai-1.6.1/venv + +# Activate the virtual environment +source /ryzen_ai-1.6.1/venv/bin/activate +``` + +- Collecting the necessary files to get in current working directory + +**Deployment folder** — contains the libraries needed to run LLM models: + +```bash +cp -r /ryzen_ai-1.6.1/venv/deployment . +``` + +**Model Benchmark Script:** + +```bash +cp /ryzen_ai-1.6.1/venv/LLM/examples/model_benchmark . +``` + +**Prompt file** — input for your LLM model: + +```bash +cp /ryzen_ai-1.6.1/venv/LLM/examples/amd_genai_prompt.txt . +``` + +- Current working directory should have below files + +``` +deployment model_benchmark amd_genai_prompt.txt Phi-3.5-mini-instruct-onnx-ryzenai-npu +``` + +- Two files under Phi-3.5 Model have to be updated to make it work for Linux environment + +1. **Edit genai_config.json** file under Model folder: + - Set `"custom_ops_library": "deployment/lib/libonnx_custom_ops.so"` (line 8) + - Add under `config_entries`: `"hybrid_dbg_use_aie_rope": "0"` (line 11) + +2. 
**Edit .cache/MatMulNBits_2_0_meta.json** file under Model folder: + + Python utility script helps convert Windows-style paths in "MatMulNBits_2_0_meta.json" to Linux-style paths: + +```python +import json + +with open('Phi-3.5-mini-instruct-onnx-ryzenai-npu/.cache/MatMulNBits_2_0_meta.json','r') as f: + lines = f.readlines() + for i in range(len(lines)): + if '.cache' in lines[i]: + lines[i] = lines[i].replace('\\','/') + +with open('Phi-3.5-mini-instruct-onnx-ryzenai-npu/.cache/MatMulNBits_2_0_meta.json','w') as f: + f.writelines(lines) +``` + +- Lastly, add directories for LD_LIBRARY_PATH + +```bash +export LD_LIBRARY_PATH=deployment/lib:$LD_LIBRARY_PATH +``` + +- We can now run our Model with command below: + +```bash +./model_benchmark -i Phi-3.5-mini-instruct-onnx-ryzenai-npu/ -l 128 -f amd_genai_prompt.txt + +# Enable "-v" flag for verbose output +``` + +## Expected output + +```bash notest +------------------------------ +Prompt Number of Tokens: 128 + +Batch size: 1, prompt tokens: 128, tokens to generate: 128 +Prompt processing (time to first token): + avg (us): 442251 + avg (tokens/s): 289.428 + p50 (us): 442583 + stddev (us): 4901.59 + n: 5 * 128 token(s) +Token generation: + avg (us): 85353.7 + avg (tokens/s): 11.716 + p50 (us): 84689.3 + stddev (us): 7012.99 + n: 635 * 1 token(s) +Token sampling: + avg (us): 27.4852 + avg (tokens/s): 36383.2 + p50 (us): 27.652 + stddev (us): 0.928063 + n: 5 * 1 token(s) +E2E generation (entire generation loop): + avg (ms): 11282.4 + p50 (ms): 11275.4 + stddev (ms): 14.2974 + n: 5 +Peak working set size (bytes): 6736375808 +``` + +## Preparing OGA Model + +Preparing OGA Model is a two-step process + +### Model Quantization + +- Follow Model Quantization steps described here [ONNX Model Preparation](/develop/onnx-model-preparation) + +### Postprocessing + +- Model Quantization step produces Pytorch quantized model. 
+- Model_generate script initially converts Pytorch quantized model to Onnx format and subsequently postprocesses to run for NPU Execution mode. + +```bash +pip install onnx-ir + +model_generate --npu --optimize decode +``` + +- Expected Output + +```bash +NPU optimize decode model generated successfully. +``` diff --git a/docs/models-tutorials/llms/llm-sft-deploy/index.mdx b/docs/models-tutorials/llms/llm-sft-deploy/index.mdx new file mode 100644 index 00000000..d7f2a97e --- /dev/null +++ b/docs/models-tutorials/llms/llm-sft-deploy/index.mdx @@ -0,0 +1,101 @@ +--- +title: "Fine-tune and Deploy LLMs" +ci_validated: false +--- + +{/* AUTO-GENERATED from llms/llm-sft-deploy/README.md -- do not edit directly. */} +{/* Run "node website/scripts/sync-examples.mjs" after updating the source. */} + +import CIStatus from '@site/src/components/CIStatus'; + + + +:::info Source Code +Clone the repo and find this example at [`examples/llms/llm-sft-deploy/`](https://github.com/amd/RyzenAI-SW/tree/main/examples/llms/llm-sft-deploy). +::: + +# Accelerate Finetuned LLMs Locally on NPU + iGPU Ryzen AI Processor + +This repo provides supplemental code to the AMD Blog [Accelerate Finetuned LLMs Locally on NPU + iGPU Ryzen AI processor](https://www.amd.com/en/developer/resources/technical-articles/accelerate-llms-locally-on-amd-ryzen-ai-npu-and-igpu.html). Code is provided for LoRA finetuning on MI300X and then running inference of finetuned model on Ryzen AI. + +# Finetuning LLMs + +## Getting Started +1. Install miniconda/anaconda and create a new conda environment for training/inference on GPUs +2. Install requirements.txt using `pip install -r requirements.txt` +3. Set Huggingface API Tokens by `export={HUGGINGFACE_API_TOKEN}` in terminal. Needed for accessing gated models and saving to Huggingface. + +## Finetune + +We provide ``train.py`` to do LoRA finetuning. Training can be saved locally or directly to huggingface and wandB can be utilized to track training
+
+Set ``--hf_dir local`` to save locally and bypass the Hugging Face and wandB setup.
+
+The training script LoRA finetunes Llama3.2 1B on the [Volve Alpaca Dataset](https://huggingface.co/datasets/bengsoon/volve_alpaca), an application for the oil & rigging industry.
+
+### Finetuning Adapter (Save Locally)
+
+```bash notest
+python train.py --lora --lora_qv --hf_dir local
+```
+
+### Finetuning Adapter (Save to HF)
+
+```bash notest
+python train.py --lora --lora_qv --hf_dir 
+```
+
+### Merging Adapter
+After finetuning the adapter, merge the adapter with the base LLM through the following:
+
+```bash notest
+python train.py --merge_model --model_name meta-llama/Llama-3.2-1B --adapter_model_dir 
+```
+
+## LLM Inference of Finetuned models on GPU
+Use ``inference.py`` to run inference on GPU.
+Set ``--inference_filename`` to a ".json" filename in which model predictions will be stored. + +#### Inference on Finetuned (merged) model + +```bash notest +python inference.py --fp --model_dir amd/volve-llama3.2-1b --inference_filename "volve-llama3_1B.json" +``` + +#### Inference on Quark Quantized model (safetensors) +- Install Quark from wheel file [here](https://quark.docs.amd.com/latest/install.html#install-quark-quark-examples-from-download).
+- Inside the zip folder are example scripts. Use the following for AWQ quantization:
+
+```
+cd examples/torch/language_modeling/llm_ptq/
+python quantize_quark.py \
+    --model_dir \
+    --output_dir \
+    --quant_scheme w_uint4_per_group_asym \
+    --num_calib_data 128 \
+    --quant_algo awq \
+    --dataset pileval_for_awq_benchmark \
+    --model_export hf_format \
+    --data_type float16 \
+    --custom_mode awq
+```
+
+- Run the following for inference:
+
+```bash notest
+python inference.py --quark_safetensors --quant_model_dir --inference_filename "quantized_model.json"
+```
+
+
+# Deploy on Ryzen AI
+
+### Quantize the full-precision, finetuned model using the quantization strategy mentioned [here](https://ryzenai.docs.amd.com/en/latest/oga_model_prepare.html#generate-quantized-model)
+
+### Install RyzenAI and prerequisites according to the instructions [here](https://ryzenai.docs.amd.com/en/latest/inst.html).
+
+### Transform the quantized model to run with the Hybrid approach within Ryzen AI, utilizing both the iGPU and NPU, by running the following. See reference [here](https://ryzenai.docs.amd.com/en/latest/oga_model_prepare.html#postprocessing).
+
+### Run inference on RyzenAI with the following:
+``python inference_oga.py --model_dir "" --inference_filename hybrid_ft_model.json`` + +Please check the blog post for comprehensive instructions on additional packages needed within the ryzen-ai conda environment. + diff --git a/docs/models-tutorials/llms/oga-api/index.mdx b/docs/models-tutorials/llms/oga-api/index.mdx new file mode 100644 index 00000000..70a6bf00 --- /dev/null +++ b/docs/models-tutorials/llms/oga-api/index.mdx @@ -0,0 +1,117 @@ +--- +title: "OGA C++ API" +ci_validated: false +--- + +{/* AUTO-GENERATED from llms/oga_api/README.md -- do not edit directly. */} +{/* Run "node website/scripts/sync-examples.mjs" after updating the source. */} + +import CIStatus from '@site/src/components/CIStatus'; + + + +:::info Source Code +Clone the repo and find this example at [`examples/llms/oga_api/`](https://github.com/amd/RyzenAI-SW/tree/main/examples/llms/oga_api). +::: + +# Ryzen AI LLM - Onnxruntime GenAI + +Ryzen AI Software includes support for deploying LLMs on Ryzen AI PCs using the ONNX Runtime generate() API (OGA). + +## Pre-optimized Models + +AMD provides a set of pre-optimized LLMs ready to be deployed with Ryzen AI Software and the supporting runtime for hybrid and NPU execution. These models can be found on Hugging Face: + +### Published models: +- [Ryzen AI Hybrid models.](https://huggingface.co/collections/amd/ryzenai-14-llm-hybrid-models-67da31231bba0f733750a99c) +- [Ryzen AI NPU models.](https://huggingface.co/collections/amd/ryzenai-13-llm-npu-models-6759f510b8132db53e044aaf) + +## Ryzen AI Installation + +- The steps for installing Ryzen AI along with it's requirement can be found in the Official Ryzen AI Software documantion page here - https://ryzenai.docs.amd.com/en/latest/inst.html + +## Steps to compile and run LLM example. +- Activate Ryzen AI environment: +``` +conda activate ryzen-ai-1.5.0 +``` +- Download the model: This example uses the Llama-2-7b-chat model. 
+``` +#hyrbid model: +git clone https://huggingface.co/amd/Llama-2-7b-chat-hf-awq-g128-int4-asym-fp16-onnx-hybrid + +#npu model: +git clone https://huggingface.co/amd/Llama2-7b-chat-awq-g128-int4-asym-bf16-onnx-ryzen-strix +``` + +- Clone the RyzenAI-SW repository: +``` +git clone https://github.com/amd/RyzenAI-SW +``` +- Navigate to OGA_API folder: +``` +cd path\to\RyzenAI-SW\example\llm\oga_api +``` +- Copy necessary DLLs and header files: +``` +xcopy /I "%RYZEN_AI_INSTALLATION_PATH%\deployment\*" libs +xcopy /I "%RYZEN_AI_INSTALLATION_PATH%\LLM\lib\onnxruntime-genai.lib" libs +xcopy /I "%RYZEN_AI_INSTALLATION_PATH%\LLM\include\*" include +``` +- Compile and build the code: +``` + mkdir build + cd build + cmake .. -A x64 + cmake --build . --config Release + cd bin\Release +``` +- Execute code: +``` +.\example.exe -m "" +``` +- Sample command +``` +.\example.exe -m "path\to\Llama-2-7b-chat-hf-awq-g128-int4-asym-fp16-onnx-hybrid" +``` + +- Sample output: +``` +Initializing ORT GenAI... +Loading Model from: C:\Users\satreysa\Downloads\RyzenAI-SW\example\llm\oga_api\models\Llama-2-7b-chat-hf-awq-g128-int4-asym-fp16-onnx-hybrid +Model loaded. +Creating Tokenizer... +Tokenizer created. +Creating Generator... +Generator created. +-------------------------------- +Enter prompt: Explain the basics of object oriented programming +Generating response: + Of course! Object-oriented programming (OOP) is a programming paradigm that organizes software design around objects, which are instances of classes, rather than functions and data. Here are the basics of OOP: + +1. Classes and Objects: In OOP, a class is a blueprint or template for creating objects. A class defines the properties and behaviors of an object, and it can contain other classes or objects as members. An object is an instance of a class, and it has its own set of attributes (data) and methods (functions). +2. Inheritance: Inheritance is the process of creating a new class based on an existing class. 
The new class (the subclass) inherits the properties and behaviors of the existing class (the superclass), and it can also add new properties and behaviors. +3. Polymorphism: Polymorphism is the ability of an object to take on many forms. In OOP, polymorphism can occur in two ways: method overriding and method overloading. Method overriding occurs when a subclass provides a different implementation of a method that is already defined in its superclass. Method overloading occurs when a class provides multiple definitions for a method with the same name but different parameters. +4. Encapsulation: Encapsulation is the practice of hiding the implementation details of an object from the outside world. In OOP, encapsulation is used to protect the data and methods of an object from external interference or misuse. +5. Abstraction: Abstraction is the process of representing complex real-world objects or systems in a simplified way. In OOP, abstraction is used to focus on the essential features of an object and to hide the irrelevant details. +6. Composition: Composition is the process of combining objects or classes to create a new object or system. In OOP, composition is used to create complex objects by combining simpler objects or classes. +7. Inheritance Hierarchy: Inheritance hierarchy is a tree-like structure that represents the relationship between classes. A class can inherit properties and behaviors from its parent class, and it can also have its own subclasses that inherit properties and behaviors from it. +8. Interfaces: Interfaces are used to define a set of methods that a class must implement. Interfaces are used to define a contract between a class and its clients, and they are used to ensure that a class implements a set of methods that are common to all classes in a particular category. +9. Abstract Classes: Abstract classes are classes that cannot be instantiated. 
They are used to define a blueprint for a class, and they can contain methods that are intended to be overridden by subclasses. +10. Final Classes: Final classes are classes that cannot be subclassed. They are used to define a class that cannot be modified or extended. + +These are the basic concepts of object-oriented programming. Of course, there are many more advanced concepts and techniques that can be used in OOP, but these are the fundamental building blocks upon which all other concepts are based. +``` + +**Note:** This example script demonstrates how to run the LLaMA-2-7b-Chat model. The chat template used in `main.cpp` is specifically tailored for the LLaMA-2-7b-Chat model. If you are using a different model, you may need to modify the chat template accordingly to ensure compatibility with that model’s expected input format. + +``` +std::string apply_llama2_chat_template(const std::string& user_input, const std::string& system_prompt = "You are a helpful assistant.") { + return "[INST] <>\n" + system_prompt + "\n<>\n\n" + user_input + " [/INST]"; +} +``` + +# Copyright + +Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved. + diff --git a/docs/models-tutorials/llms/oga-inference/index.mdx b/docs/models-tutorials/llms/oga-inference/index.mdx new file mode 100644 index 00000000..74a015c0 --- /dev/null +++ b/docs/models-tutorials/llms/oga-inference/index.mdx @@ -0,0 +1,51 @@ +--- +title: "OGA Inference (Python)" +ci_validated: false +--- + +{/* AUTO-GENERATED from llms/oga_inference/README.md -- do not edit directly. */} +{/* Run "node website/scripts/sync-examples.mjs" after updating the source. */} + +import CIStatus from '@site/src/components/CIStatus'; + + + +:::info Source Code +Clone the repo and find this example at [`examples/llms/oga_inference/`](https://github.com/amd/RyzenAI-SW/tree/main/examples/llms/oga_inference). 
+::: + +# OGA Inference with Chat Template + +Inference script with chat template support for ONNX Runtime GenAI models. + +## When to use this? +Use this for models that require chat templates (e.g., GPT-OSS-20B) for better output quality. + +Based on Microsoft OGA [model-chat.py](https://github.com/microsoft/onnxruntime-genai/blob/rel-0.11.2/examples/python/model-chat.py), modified for Ryzen AI. + +## Prerequisites +- Ryzen AI Software installed (see [Installation Instructions](https://ryzenai.docs.amd.com/en/latest/inst.html)) +- Activate the conda environment created by the MSI installer: +```bash + conda activate ryzen-ai- +``` +- For more details on running LLMs with OGA, see [OnnxRuntime GenAI (OGA) Flow](https://ryzenai.docs.amd.com/en/latest/hybrid_oga.html). + +## Usage +```bash +python model_chat.py -m -pr -ipl -tm +``` + +## Arguments +| Argument | Description | +|----------|-------------| +| `-m` | Path to ONNX model folder | +| `-pr` | Prompt file (.txt) | +| `-ipl` | Input prompt length in tokens (auto-caps to fit context) | +| `-tm` | Show timing info | +| `-v` | Verbose output | + +## Example +```bash +python model_chat.py -m ./gpt-oss-20b-onnx-ryzenai-npu -pr prompt.txt -ipl 256 -tm +``` diff --git a/docs/models-tutorials/llms/overview.mdx b/docs/models-tutorials/llms/overview.mdx new file mode 100644 index 00000000..33afd598 --- /dev/null +++ b/docs/models-tutorials/llms/overview.mdx @@ -0,0 +1,178 @@ +--- +title: LLM Deployment Overview +sidebar_position: 1 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# LLM Deployment Overview + + + +Large Language Models (LLMs) can be deployed on Ryzen AI PCs with NPU and GPU acceleration. NPU-only and Hybrid execution modes, which utilize both the NPU and integrated GPU (iGPU), are supported via ONNXRuntime GenAI (OGA). GPU-only acceleration is enabled through llama.cpp. 
See the [Execution Modes table](#execution-modes) below for detailed information. + +## Execution Modes + +| Mode | Framework(s) | Compute Allocation | Primary Use Case | +|------|--------------|-------------------|------------------| +| **NPU-Only** | OnnxRuntime GenAI (OGA) | Neural Processing Unit (NPU) exclusive | Maximum NPU utilization while preserving iGPU for parallel workloads | +| **Hybrid** | OnnxRuntime GenAI (OGA) | Dynamic NPU + iGPU partitioning | Interactive inference with optimal prefill/decode performance | +| **GPU** | llama.cpp | Dedicated GPU execution | High-throughput inference on discrete/integrated GPU | +| **CPU** | OGA or llama.cpp | Traditional CPU-based inference | Baseline compatibility across all processor generations | + +## Hardware Requirements + +| Processor Series | NPU-Only | Hybrid | GPU/CPU | +|------------------|-----------|--------|---------| +| Ryzen AI 300 (STX/KRK) | ✓ | ✓ | ✓ | +| Ryzen AI 7000/8000 | ✗ | ✗ | ✓ | + +## Development Interfaces + +The Ryzen AI LLM software stack is available through three development interfaces, each suited for specific use cases as outlined in the sections below. All three interfaces are built on top of native OnnxRuntime GenAI (OGA) libraries or llama.cpp libraries, as shown in the diagram below. + +The high-level Python APIs, as well as the Server Interface, also leverage the Lemonade SDK, which is multi-vendor open-source software that provides everything necessary for quickly getting started with LLMs on OGA or llama.cpp. + +A key benefit of Lemonade is that software developed against their interfaces is portable to many other execution backends. 
+ +**Ryzen AI Software Stack:** + +| Your Python Application | Your LLM Stack | Your Native Application | +|-------------------------|----------------|-------------------------| +| [Lemonade Python API](/models-tutorials/llms/python-api) | [Lemonade Server Interface](/models-tutorials/llms/server-interface) | [OGA C++ Headers](/models-tutorials/llms/hybrid-inference) **OR** [llama.cpp C++ Headers](https://github.com/ggml-org/llama.cpp) | +| Custom [AMD OnnxRuntime GenAI (OGA)](https://github.com/microsoft/onnxruntime-genai) **OR** [llama.cpp](https://github.com/ggml-org/llama.cpp) | | | +| [AMD Ryzen AI Driver and Hardware](https://www.amd.com/en/products/processors/consumer/ryzen-ai.html) | | | + +* indicates open-source software (OSS). + +### Server Interface (REST API) + +The Server Interface provides a convenient means to integrate with applications that: + +- Already support an LLM server interface, such as the Ollama server or OpenAI API. +- Are written in any language (C++, C#, Javascript, etc.) that supports REST APIs. +- Benefits from process isolation for the LLM backend. + +Lemonade Server is available in two ways: + +- **Standalone Windows GUI installer**: Quick setup with a desktop shortcut for immediate use. (Recommended for end users, see [Server Interface](/models-tutorials/llms/server-interface)) +- **Full Lemonade SDK**: Complete development toolkit with server interface included. (Recommended for developers, see [High-Level Python SDK](/models-tutorials/llms/python-api) for Python SDK) + +For example applications that have been tested with Lemonade Server, see the [Lemonade Server Examples](https://github.com/lemonade-sdk/lemonade/tree/main/docs/server/apps). + +### High-Level Python SDK + +The high-level Python SDK, Lemonade, allows you to get started using PyPI installation in approximately 5 minutes. + +This SDK allows you to: + +- Experiment with models in hybrid or NPU-only execution mode on Ryzen AI hardware. 
+- Validate inference speed and task performance. +- Integrate with Python apps using a high-level API. + +To get started in Python, follow these instructions: [High-Level Python SDK](/models-tutorials/llms/python-api). + +### OGA APIs for C++ Libraries and Python + +Native C++ libraries for OGA are available to give full customizability for deployment into native applications. The Python bindings for OGA also provide a customizable interface for Python development. + +To get started with the OGA APIs, follow these instructions: [Hybrid OGA](/models-tutorials/llms/hybrid-inference). + +## Supported LLMs + +AMD provides pre-optimized LLMs ready to deploy with Ryzen AI Software. + +Hugging Face collections: +- [Ryzen-AI-1.7-Hybrid-LLM](https://huggingface.co/collections/amd/ryzen-ai-17-hybrid-llm) — Models that run across CPU and NPU +- [Ryzen-AI-1.7-NPU-LLM](https://huggingface.co/collections/amd/ryzen-ai-17-npu-llm) — Models that run entirely on the NPU + +:::note Quantization +Most models use **INT4 (AWQ, group size 128)**. Phi-4 models use **GPTQ** quantization. 
+::: + +### Llama Family + +| Model | Parameters | Hybrid | NPU-Only | Context | Verified | +|-------|------------|:------:|:--------:|---------|----------| +| [amd/Llama-2-7b-chat-hf-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-2-7b-chat-hf-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Llama-2-7b-hf-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-2-7b-hf-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Meta-Llama-3-8B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Meta-Llama-3-8B-onnx-ryzenai-1.7-hybrid) | 8B | ✓ | ✓ | 4K | Pending | +| [amd/Llama-3.1-8B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-3.1-8B-onnx-ryzenai-1.7-hybrid) | 8B | ✓ | ✓ | 4K | Pending | +| [amd/Meta-Llama-3.1-8B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Meta-Llama-3.1-8B-Instruct-onnx-ryzenai-1.7-hybrid) | 8B | ✓ | ✓ | 4K | Pending | +| [amd/Llama-3.2-1B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-3.2-1B-onnx-ryzenai-1.7-hybrid) | 1B | ✓ | ✓ | 4K | Pending | +| [amd/Llama-3.2-1B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-3.2-1B-Instruct-onnx-ryzenai-1.7-hybrid) | 1B | ✓ | ✓ | 4K | Pending | +| [amd/Llama-3.2-3B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-3.2-3B-onnx-ryzenai-1.7-hybrid) | 3B | ✓ | — | 4K | Pending | +| [amd/Llama-3.2-3B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Llama-3.2-3B-Instruct-onnx-ryzenai-1.7-hybrid) | 3B | ✓ | — | 4K | Pending | +| [amd/CodeLlama-7b-Instruct-hf-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/CodeLlama-7b-Instruct-hf-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | + +### DeepSeek Family + +| Model | Parameters | Hybrid | NPU-Only | Context | Verified | +|-------|------------|:------:|:--------:|---------|----------| +| [amd/DeepSeek-R1-Distill-Llama-8B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/DeepSeek-R1-Distill-Llama-8B-onnx-ryzenai-1.7-hybrid) | 8B | ✓ | ✓ | 4K | 
Pending | +| [amd/DeepSeek-R1-Distill-Qwen-1.5B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/DeepSeek-R1-Distill-Qwen-1.5B-onnx-ryzenai-1.7-hybrid) | 1.5B | ✓ | ✓ | 4K | Pending | +| [amd/DeepSeek-R1-Distill-Qwen-7B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/DeepSeek-R1-Distill-Qwen-7B-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | + +### Phi Family + +| Model | Parameters | Hybrid | NPU-Only | Context | Verified | +|-------|------------|:------:|:--------:|---------|----------| +| [amd/Phi-3-mini-4k-instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Phi-3-mini-4k-instruct-onnx-ryzenai-1.7-hybrid) | 3.8B | ✓ | ✓ | 4K | Pending | +| [amd/Phi-3-mini-128k-instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Phi-3-mini-128k-instruct-onnx-ryzenai-1.7-hybrid) | 3.8B | ✓ | ✓ | 4K | Pending | +| [amd/Phi-3.5-mini-instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Phi-3.5-mini-instruct-onnx-ryzenai-1.7-hybrid) | 3.8B | ✓ | ✓ | 4K | Pending | +| [amd/Phi-4-mini-instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Phi-4-mini-instruct-onnx-ryzenai-1.7-hybrid) | 4B | ✓ | — | 4K | Pending | +| [amd/Phi-4-mini-reasoning-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Phi-4-mini-reasoning-onnx-ryzenai-1.7-hybrid) | 4B | ✓ | — | 4K | Pending | + +### Qwen Family + +| Model | Parameters | Hybrid | NPU-Only | Context | Verified | +|-------|------------|:------:|:--------:|---------|----------| +| [amd/Qwen-2.5_1.5B_Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen-2.5_1.5B_Instruct-onnx-ryzenai-1.7-hybrid) | 1.5B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen1.5-7B-Chat-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen1.5-7B-Chat-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen2-1.5B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2-1.5B-onnx-ryzenai-1.7-hybrid) | 1.5B | ✓ | ✓ | 4K | Pending | +| 
[amd/Qwen2-7B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2-7B-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen2.5-0.5B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5-0.5B-Instruct-onnx-ryzenai-1.7-hybrid) | 0.5B | ✓ | — | 4K | Pending | +| [amd/Qwen2.5-7B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5-7B-Instruct-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen2.5-Coder-0.5B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5-Coder-0.5B-Instruct-onnx-ryzenai-1.7-hybrid) | 0.5B | ✓ | — | 4K | Pending | +| [amd/Qwen2.5-Coder-1.5B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5-Coder-1.5B-Instruct-onnx-ryzenai-1.7-hybrid) | 1.5B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen2.5-Coder-7B-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5-Coder-7B-Instruct-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen2.5_3B_Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5_3B_Instruct-onnx-ryzenai-1.7-hybrid) | 3B | ✓ | ✓ | 4K | Pending | +| [amd/Qwen2.5-14B-instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen2.5-14B-instruct-onnx-ryzenai-1.7-hybrid) | 14B | ✓ | — | 4K | Pending | +| [amd/Qwen3-1.7B-awq-quant-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen3-1.7B-awq-quant-onnx-ryzenai-1.7-hybrid) | 1.7B | ✓ | — | 4K | Pending | +| [amd/Qwen3-4B-awq-quant-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen3-4B-awq-quant-onnx-ryzenai-1.7-hybrid) | 4B | ✓ | — | 4K | Pending | +| [amd/Qwen3-8B-awq-quant-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen3-8B-awq-quant-onnx-ryzenai-1.7-hybrid) | 8B | ✓ | — | 4K | Pending | +| [amd/Qwen3-14B-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Qwen3-14B-onnx-ryzenai-1.7-hybrid) | 14B | ✓ | — | 4K | Pending | + +### Mistral Family + +| Model | Parameters | Hybrid | NPU-Only | Context | Verified | 
+|-------|------------|:------:|:--------:|---------|----------| +| [amd/Mistral-7B-Instruct-v0.1-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Mistral-7B-Instruct-v0.1-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Mistral-7B-Instruct-v0.2-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Mistral-7B-Instruct-v0.2-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Mistral-7B-Instruct-v0.3-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Mistral-7B-Instruct-v0.3-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | +| [amd/Mistral-7B-v0.3-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/Mistral-7B-v0.3-onnx-ryzenai-1.7-hybrid) | 7B | ✓ | ✓ | 4K | Pending | + +### Other Models + +| Model | Parameters | Hybrid | NPU-Only | Context | Verified | +|-------|------------|:------:|:--------:|---------|----------| +| [amd/gemma-2-2b-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/gemma-2-2b-onnx-ryzenai-1.7-hybrid) | 2B | ✓ | — | 3K | Pending | +| [amd/AMD-OLMo-1B-SFT-DPO-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/AMD-OLMo-1B-SFT-DPO-onnx-ryzenai-1.7-hybrid) | 1B | ✓ | — | 2K | Pending | +| [amd/chatglm3-6b-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/chatglm3-6b-onnx-ryzenai-1.7-hybrid) | 6B | ✓ | ✓ | 4K | Pending | +| [amd/SmolLM2-135M-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/SmolLM2-135M-Instruct-onnx-ryzenai-1.7-hybrid) | 135M | ✓ | — | 4K | Pending | +| [amd/SmolLM-135M-Instruct-onnx-ryzenai-1.7-hybrid](https://huggingface.co/amd/SmolLM-135M-Instruct-onnx-ryzenai-1.7-hybrid) | 135M | ✓ | — | 4K | Pending | + +Fine-tuned versions of these models are also supported. For instructions on preparing a fine-tuned OGA model, refer to [ONNX Model Preparation](/develop/onnx-model-preparation). 
+ +## End-to-End OGA Validation + +The Lemonade CLI provides built-in tools for end-to-end validation of OGA hybrid and NPU-only execution, including: + +- Prompting with templates +- Benchmarking (time-to-first-token and tokens-per-second) +- Accuracy measurement +- Memory profiling + +For CLI usage and validation commands, see the [Lemonade Server CLI Guide](https://lemonade-server.ai/docs/server/lemonade-server-cli/). diff --git a/docs/models-tutorials/llms/python-api.mdx b/docs/models-tutorials/llms/python-api.mdx new file mode 100644 index 00000000..edb9c767 --- /dev/null +++ b/docs/models-tutorials/llms/python-api.mdx @@ -0,0 +1,67 @@ +--- +title: High-Level Python SDK +sidebar_position: 2 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# High-Level Python SDK + + + +A Python environment offers flexibility for experimenting with LLMs, profiling them, and integrating them into Python applications. We use the [Lemonade SDK](https://github.com/lemonade-sdk/lemonade) to get up and running quickly. + +To get started, follow these instructions. + +## System-level pre-requisites + +You only need to do this once per computer: + +1. Make sure your system has the recommended Ryzen AI driver installed as described in the [installation guide](/getting-started/installation). +2. Download and install [Miniconda for Windows](https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe) or [Miniforge for Windows](https://github.com/conda-forge/miniforge/releases/download/25.3.0-1/Miniforge3-25.3.0-1-Windows-x86_64.exe). +3. Launch a terminal and call `conda init`. 
+ +## Environment Setup + +To create and set up an environment, run these commands in your terminal: + +```bash +conda create -n ryzenai-llm python=3.12 +conda activate ryzenai-llm +pip install lemonade-sdk[dev,oga-ryzenai] --extra-index-url=https://pypi.amd.com/simple +``` + +## Validation Tools + +Now that you have completed installation, you can try prompting an LLM like this (where `PROMPT` is any prompt you like). + +Run this command in a terminal that has your environment activated: + +```bash +lemonade -i amd/Llama-3.2-1B-Instruct-onnx-ryzenai-hybrid oga-load --device hybrid --dtype int4 llm-prompt --max-new-tokens 64 -p PROMPT +``` + +For more details on validation commands, see the [Lemonade Server CLI Guide](https://lemonade-server.ai/docs/server/lemonade-server-cli/). + +## Python API + +You can also run this code to try out the high-level Lemonade API in a Python script: + +```python +from lemonade.api import from_pretrained + +model, tokenizer = from_pretrained( + "amd/Llama-3.2-1B-Instruct-onnx-ryzenai-hybrid", recipe="oga-hybrid" +) + +input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids +response = model.generate(input_ids, max_new_tokens=30) + +print(tokenizer.decode(response[0])) +``` + +## Next Steps + +From here, you can explore additional validation tools for measuring speed and accuracy, streaming responses with the API, and launching the server interface. See the [Supported LLMs](/models-tutorials/llms/supported-models) for model-specific examples, or the [Lemonade Server CLI Guide](https://lemonade-server.ai/docs/server/lemonade-server-cli/) for full CLI documentation. 
diff --git a/docs/models-tutorials/llms/rag-oga/index.mdx b/docs/models-tutorials/llms/rag-oga/index.mdx new file mode 100644 index 00000000..e7a7646b --- /dev/null +++ b/docs/models-tutorials/llms/rag-oga/index.mdx @@ -0,0 +1,36 @@ +--- +title: "RAG with OGA" +ci_validated: false +--- + +{/* AUTO-GENERATED from llms/RAG-OGA/README.md -- do not edit directly. */} +{/* Run "node website/scripts/sync-examples.mjs" after updating the source. */} + +import CIStatus from '@site/src/components/CIStatus'; + + + +:::info Source Code
Clone the repo and find this example at [`examples/llms/RAG-OGA/`](https://github.com/amd/RyzenAI-SW/tree/main/examples/llms/RAG-OGA). +::: + + + +## Introduction +Welcome to this repository, a showcase of an **ONNX Runtime GenAI (OGA)-based RAG LLM sample application** running on a **Ryzen AI processor**. +This repo provides supplemental code to the AMD Blog [RAG with Hybrid LLM on AMD Ryzen AI Processor](https://www.amd.com/en/developer/resources/technical-articles/2025/rag-with-hybrid-llm-on-amd-ryzen-ai-processors.html). 
## What You’ll Find Here + +- **Retrieval-Augmented Generation (RAG) pipeline** powered by: + - A **hybrid LLM** that enables disaggregated inference, in which the compute-heavy prefill phase runs on the NPU, while the decode phase executes on the GPU. + - An **embedding model** compiled with the **Vitis AI Execution Provider** +- Built using the widely adopted **LangChain** orchestration framework + +## Quick Setup + +Follow these simple steps to get started: + +1. Execute the setup steps outlined below to provision your environment. +2. After setup, this README will guide you through how to run the sample application. 
+ diff --git a/docs/models-tutorials/llms/server-interface.mdx b/docs/models-tutorials/llms/server-interface.mdx new file mode 100644 index 00000000..fb4ce7e5 --- /dev/null +++ b/docs/models-tutorials/llms/server-interface.mdx @@ -0,0 +1,76 @@ +--- +title: Server Interface (REST API) +sidebar_position: 3 +ci_validated: false +ci_last_run: 2026-02-27 +--- + +import CIStatus from '@site/src/components/CIStatus'; + +# Server Interface (REST API) + + + +The Lemonade SDK offers a server interface that allows your application to load an LLM on Ryzen AI hardware in a separate process, and then communicate with this process using standard `REST` APIs. This allows applications written in any language (C#, JavaScript, Python, C++, etc.) to easily integrate with Ryzen AI LLMs. + +Server interfaces are used across the LLM ecosystem because they allow for no-code plug-and-play between the higher levels of the application stack (GUIs, agents, RAG, etc.) and the LLM and hardware that have been abstracted by the server. For more information, see the [Understanding local LLM Servers Guide](https://lemonade-server.ai/docs/server/concepts/). + +For example, open source projects such as [Open WebUI](https://lemonade-server.ai/docs/server/apps/open-webui/) have out-of-the-box support for connecting to a variety of server interfaces, which in turn allows users to quickly start working with LLMs in a GUI. + +## Server Setup + +Lemonade Server can be installed via the Lemonade Server Installer executable by following these steps: + +1. Make sure your system has the recommended Ryzen AI driver installed as described in the [installation guide](/getting-started/installation). +2. Download and install `Lemonade_Server_Installer.exe` from the [latest Lemonade release](https://github.com/lemonade-sdk/lemonade/releases). +3. Launch the server by double-clicking the `lemonade_server` shortcut added to your desktop. + +For a visual walkthrough of this process, watch our Lemonade Introductory Video: + 
+