2 changes: 1 addition & 1 deletion .env.example
@@ -39,5 +39,5 @@ PRODUCER_WORKER_POOL_SIZE=

# Optional: Space-separated list of repositories to index
# Used as fallback when no repositories are provided as CLI arguments
# Format: "repo1 repo2" or "repo1:index1 repo2:index2"
# Format: "repo1 repo2" or "repo1:alias1 repo2:alias2"
# REPOSITORIES_TO_INDEX=
41 changes: 29 additions & 12 deletions README.md
@@ -42,7 +42,12 @@ cp .env.example .env
# This reduces indexing time and improves relevance by excluding tests, build artifacts, etc.

# 6. Index your repository
npm run index -- /path/to/your/repo --clean --watch --concurrency 8
# Full rebuild (creates new backing indices + atomic alias swap)
npm run index -- /path/to/your/repo --clean --concurrency 8

# (Optional) Continuous incremental indexing (long-running)
# Note: `--watch` is not compatible with `--clean`
npm run index -- /path/to/your/repo --watch --concurrency 8
```

### Excluding Files with `.indexerignore`
@@ -85,8 +90,9 @@ Indexes one or more repositories by scanning the codebase, enqueuing code chunks

**Arguments:**

- `[repos...]` - One or more repository paths, names, or URLs (format: `repo[:index]`). Optional if `REPOSITORIES_TO_INDEX` env var is set.
- `--clean` - Delete existing Elasticsearch index before starting (full rebuild)
- `[repos...]` - One or more repository paths, names, or URLs (format: `repo[:alias]`). Optional if `REPOSITORIES_TO_INDEX` env var is set.
- `--clean` - Create a new backing index and atomically swap aliases (full rebuild)
- `--keep-old-indices` - Keep previous backing indices after alias swap
- `--pull` - Git pull before indexing
- `--watch` - Keep indexer running after processing queue (for continuous indexing)
- `--concurrency <number>` - Number of parallel workers (default: 1, recommended: CPU core count)
@@ -108,8 +114,8 @@ npm run index -- /path/to/repo --watch --concurrency 8
# Index a remote repository (clones automatically)
npm run index -- https://github.com/elastic/kibana.git --clean

# Index with custom Elasticsearch index name
npm run index -- /path/to/repo:my-custom-index
# Index with a custom alias name
npm run index -- /path/to/repo:my-custom-alias

# Index multiple repositories sequentially
npm run index -- /path/to/repo1 /path/to/repo2 --concurrency 4
@@ -120,7 +126,7 @@ npm run index -- /path/to/repo --pull
# Private repository with token
npm run index -- https://github.com/org/private-repo.git --token ghp_YourToken

# Using REPOSITORIES_TO_INDEX env var (backward compatibility)
# Using REPOSITORIES_TO_INDEX env var (optional convenience)
export REPOSITORIES_TO_INDEX="/path/to/repo1 /path/to/repo2"
npm run index -- --concurrency 4
```
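The `repo[:alias]` and `REPOSITORIES_TO_INDEX` formats above can be sketched as a small parser. This is a hypothetical illustration of the documented format, not the indexer's actual CLI code; `parseRepoSpec` and `parseRepositoriesToIndex` are illustrative names:

```javascript
// Hypothetical sketch of parsing a `repo[:alias]` spec.
// The suffix after the last ':' is treated as an alias only if it contains
// no path separator, so URLs like "https://github.com/org/repo.git" survive.
function parseRepoSpec(spec) {
  const i = spec.lastIndexOf(':');
  if (i > 0 && !spec.slice(i + 1).includes('/') && !spec.slice(i + 1).includes('\\')) {
    return { repo: spec.slice(0, i), alias: spec.slice(i + 1) };
  }
  return { repo: spec, alias: null };
}

// REPOSITORIES_TO_INDEX is a space-separated list of such specs.
function parseRepositoriesToIndex(value) {
  return value.trim().split(/\s+/).filter(Boolean).map(parseRepoSpec);
}
```

For example, `parseRepoSpec('/path/to/repo:my-custom-alias')` yields the repo path plus the alias, while a bare URL is returned with `alias: null` so the indexer can fall back to a derived default.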
@@ -136,7 +142,18 @@ npm run index -- --concurrency 4
- Without `--clean`: Automatically detects if this is a first-time index or an incremental update
- If no previous index exists, performs a full index
- If previous index exists, only processes changed files since last indexed commit
- With `--clean`: Always performs a full rebuild, deleting the existing index first
- With `--clean`: Always performs a full rebuild into a new backing index, then atomically swaps aliases

**Important:** `--watch` and `--clean` are mutually exclusive. Use `--clean` for a one-off maintenance rebuild and `--watch` for long-running incremental updates.

### Maintenance lock (clean rebuild safety)

When running with `--clean`, the indexer uses a **per-alias maintenance lock** stored in `<alias>_settings` (document id: `_reindex_lock`):

- A `--clean` run **acquires the lock**, builds new backing indices, performs an **atomic alias swap**, then **releases the lock**.
- A non-clean run (cron/incremental) checks for the lock and, if present, logs that maintenance is in progress and **skips** that repo/alias.

This is designed for GitOps-style deployments where you can run maintenance rebuilds without pausing scheduled incremental jobs.
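The acquire/skip/release flow can be sketched as follows. This is an illustrative model, not the indexer's actual code: `settingsStore` stands in for the `<alias>_settings` index, and the field names in the lock document are assumptions; only the document id `_reindex_lock` comes from the documentation above.

```javascript
const LOCK_ID = '_reindex_lock';

// A --clean run: acquire the lock, rebuild, and always release it,
// whether the rebuild succeeds or fails.
async function runWithMaintenanceLock(settingsStore, rebuild) {
  if (await settingsStore.get(LOCK_ID)) {
    throw new Error('another maintenance rebuild is already in progress');
  }
  await settingsStore.put(LOCK_ID, { startedAt: Date.now() }); // acquire
  try {
    await rebuild(); // build backing indices + atomic alias swap
  } finally {
    await settingsStore.delete(LOCK_ID); // release on success or failure
  }
}

// A non-clean (cron/incremental) run: skip the alias while the lock exists.
async function shouldSkipIncremental(settingsStore) {
  return (await settingsStore.get(LOCK_ID)) != null;
}
```

The `try/finally` shape is what guarantees the third bullet above: a failed rebuild still releases the lock, so scheduled incremental runs resume on the next tick.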

### `npm run scaffold-language`

@@ -325,11 +342,11 @@ Configuration is managed via environment variables in a `.env` file.

### Elasticsearch indices created

Given a base index name (from `ELASTICSEARCH_INDEX` or CLI `repo[:index]`), the indexer creates and maintains:
Given a base alias name (from `ELASTICSEARCH_INDEX` or CLI `repo[:alias]`), the indexer creates and maintains:

- `<index>`: the primary chunk index (semantic search + metadata)
- `<index>_settings`: small settings/state index (e.g. last indexed commit per branch)
- `<index>_locations`: dedicated per-file location index (one document per chunk occurrence)
- `<alias>`: alias that points to the active backing chunk index
- `<alias>_locations`: alias that points to the active backing locations index
- `<alias>_settings`: settings/state index (e.g. last indexed commit per branch and maintenance lock)
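As a sketch, the three managed names relate to the base alias like so (a hypothetical helper for illustration; the indexer derives these internally):

```javascript
// Hypothetical helper showing how the managed names derive from the base alias.
function derivedNames(alias) {
  return {
    chunks: alias,                   // alias -> active backing chunk index
    locations: `${alias}_locations`, // alias -> active backing locations index
    settings: `${alias}_settings`,   // plain index: per-branch state + maintenance lock
  };
}
```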

| Variable | Description | Default |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- |
@@ -339,7 +356,7 @@ Given a base index name (from `ELASTICSEARCH_INDEX` or CLI `repo[:index]`), the
| `ELASTICSEARCH_PASSWORD` | The password for Elasticsearch authentication. | |
| `ELASTICSEARCH_API_KEY` | An API key for Elasticsearch authentication. | |
| `ELASTICSEARCH_INDEX` | The name of the Elasticsearch index to use. This is often set dynamically by the deployment scripts. | `code-chunks` |
| `ELASTICSEARCH_INFERENCE_ID` | The Elasticsearch inference endpoint ID for the ELSER model to use. Note: `ELASTICSEARCH_MODEL` is still supported for backward compatibility. | `.elser-2-elasticsearch` |
| `ELASTICSEARCH_INFERENCE_ID` | The Elasticsearch inference endpoint ID for the ELSER model to use. Note: `ELASTICSEARCH_MODEL` is still supported. | `.elser-2-elasticsearch` |
| `OTEL_LOGGING_ENABLED` | Enable OpenTelemetry logging. | `false` |
| `OTEL_METRICS_ENABLED` | Enable OpenTelemetry metrics (defaults to same as `OTEL_LOGGING_ENABLED`). | Same as `OTEL_LOGGING_ENABLED` |
| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry logs and metrics. | `semantic-code-search-indexer` |
21 changes: 16 additions & 5 deletions docs/GCP_DEPLOYMENT_GUIDE.md
@@ -113,8 +113,9 @@ QUEUE_BASE_DIR="/var/lib/indexer/queues"

# Optional: Space-separated list of repositories to index
# Used as fallback when no repositories are provided as CLI arguments
# Format: "repo1 repo2" or "repo1:index1 repo2:index2"
REPOSITORIES_TO_INDEX="/var/lib/indexer/repos/repo-one:repo-one-index /var/lib/indexer/repos/repo-two:repo-two-index"
# Format: "repo1 repo2" or "repo1:alias1 repo2:alias2"
# Note: the value after ':' is a stable Elasticsearch *alias*; the indexer manages the backing indices automatically.
REPOSITORIES_TO_INDEX="/var/lib/indexer/repos/repo-one:repo-one /var/lib/indexer/repos/repo-two:repo-two"
```

## 2. Scheduling with Cron
@@ -142,7 +143,7 @@ cd scripts/migrations/2025-11-16-unified-index-command
Add the following line to the end of the file. This configuration will run the indexer every 10 minutes for multiple repositories.

```cron
*/10 * * * * cd /opt/semantic-code-search-indexer && /usr/bin/flock -n /tmp/indexer.lock npm run index -- /var/lib/indexer/repos/repo-one:repo-one-index /var/lib/indexer/repos/repo-two:repo-two-index --pull --concurrency 4 --token ghp_YourToken >> /opt/semantic-code-search-indexer/indexer.log 2>&1
*/10 * * * * cd /opt/semantic-code-search-indexer && /usr/bin/flock -n /tmp/indexer.lock npm run index -- /var/lib/indexer/repos/repo-one:repo-one /var/lib/indexer/repos/repo-two:repo-two --pull --concurrency 4 --token ghp_YourToken >> /opt/semantic-code-search-indexer/indexer.log 2>&1
```

**Alternative using REPOSITORIES_TO_INDEX env var:**
@@ -158,17 +159,27 @@ cd scripts/migrations/2025-11-16-unified-index-command
- `cd /opt/semantic-code-search-indexer`: Change to the project directory.
- `/usr/bin/flock -n /tmp/indexer.lock`: This is a crucial command for reliability. It ensures that only one instance of the indexer can run at a time. If a previous run is still active, the new one will not start, preventing resource contention and potential data corruption.
- `npm run index -- <repos...>`: The unified index command that handles both scanning and indexing in one pass.
- `/var/lib/indexer/repos/repo-one:repo-one-index`: Repository path with custom index name.
- `/var/lib/indexer/repos/repo-one:repo-one`: Repository path with custom alias name (stable).
- `--pull`: Git pull before indexing to get latest changes.
- `--concurrency 4`: Number of parallel workers (adjust based on VM resources).
- `--token ghp_YourToken`: GitHub token for private repositories (optional).
- `>> /opt/semantic-code-search-indexer/indexer.log 2>&1`: This redirects all output (both standard output and standard error) to a log file within the project directory. You must ensure this file is writable by the user running the cron job.

**Maintenance rebuilds (zero-downtime)**

For a one-off full rebuild without downtime, run a clean reindex. This creates new backing indices and atomically swaps aliases.

While a maintenance rebuild is in progress, scheduled non-clean runs will detect the maintenance lock in `<alias>_settings` and skip that alias.

```bash
npm run index -- /var/lib/indexer/repos/repo-one:repo-one --clean --concurrency 4
```

**For watch mode (continuous indexing):**
If you prefer to run the indexer as a long-running process that watches for changes, use the `--watch` flag and run it as a systemd service instead of a cron job:

```bash
npm run index -- /var/lib/indexer/repos/repo-one:repo-one-index --watch --concurrency 4
npm run index -- /var/lib/indexer/repos/repo-one:repo-one --watch --concurrency 4
```
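A minimal systemd unit for watch mode could look like the following. This is a sketch: the unit name, `User=`, and paths are assumptions for illustration and should be adapted to your VM.

```ini
# /etc/systemd/system/semantic-code-indexer.service (illustrative)
[Unit]
Description=Semantic code search indexer (watch mode)
After=network-online.target

[Service]
WorkingDirectory=/opt/semantic-code-search-indexer
ExecStart=/usr/bin/npm run index -- /var/lib/indexer/repos/repo-one:repo-one --watch --concurrency 4
Restart=on-failure
User=indexer

[Install]
WantedBy=multi-user.target
```

Enable it with `systemctl enable --now semantic-code-indexer`; `Restart=on-failure` gives the long-running process the same resilience that `flock` plus cron gives the scheduled variant.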

3. **Save and Exit:**
6 changes: 3 additions & 3 deletions docs/IMPLEMENTATION_SUMMARY.md
@@ -4,7 +4,7 @@

Successfully implemented indexed directory fields to enable efficient directory-level aggregations and discovery in the semantic code search indexer. This lets the MCP server help LLMs navigate large codebases (70K+ files) by surfacing significant directories before drilling into specific files.

Note: Per-file directory and path information is stored in a dedicated `<index>_locations` index (one document per chunk occurrence). The primary `<index>` stores content-deduplicated chunk documents and intentionally does **not** store file paths or directory metadata.
Note: Per-file directory and path information is stored in a dedicated `<alias>_locations` index (one document per chunk occurrence). The primary `<alias>` (an alias pointing to a backing chunk index) stores content-deduplicated chunk documents and intentionally does **not** store file paths or directory metadata.

## Changes Made

@@ -17,8 +17,8 @@ Note: Per-file directory and path information is stored in a dedicated `<index>_
- `directoryDepth: number` - Depth in directory tree (0 for root)

- Updated Elasticsearch storage model:
- The primary chunk index (`<index>`) stores content-deduplicated chunk documents (no file paths / directories).
- The locations index (`<index>_locations`) stores per-file occurrences, including:
- The primary chunk index (`<alias>`) stores content-deduplicated chunk documents (no file paths / directories).
- The locations index (`<alias>_locations`) stores per-file occurrences, including:
- `filePath` (`wildcard`)
- `directoryPath` (`keyword`)
- `directoryName` (`keyword`)
42 changes: 31 additions & 11 deletions docs/elasticsearch_guide.md
@@ -38,11 +38,31 @@ const client = new Client({

## Index Schema

The `code-indexer` creates multiple Elasticsearch indices, derived from the base index name specified by `ELASTICSEARCH_INDEX` (defaulting to `code-chunks`):
The `code-indexer` now uses an alias-first model. Given a base alias name from `ELASTICSEARCH_INDEX` (defaulting to `code-chunks`), the indexer maintains:

- `<index>` (e.g. `code-chunks`): primary chunk index (semantic search + metadata)
- `<index>_settings` (e.g. `code-chunks_settings`): small settings/state index (e.g. last indexed commit per branch)
- `<index>_locations` (e.g. `code-chunks_locations`): dedicated per-file location index (one document per chunk occurrence)
- `<alias>` (e.g. `code-chunks`): alias pointing to the active backing chunk index
- `<alias>_locations` (e.g. `code-chunks_locations`): alias pointing to the active backing locations index
- `<alias>_settings` (e.g. `code-chunks_settings`): stable settings/state index (e.g. last indexed commit per branch and maintenance lock)

### Operational model (backing indices + atomic alias swap)

The public names above (`<alias>`, `<alias>_locations`) are **Elasticsearch aliases**. On a clean rebuild (`npm run index -- <repo[:alias]> --clean`), the indexer:

1. Creates new backing indices (example: `<alias>-scsi-<id>` and `<alias>-scsi-<id>_locations`)
2. Indexes all documents into those backing indices
3. Atomically swaps the aliases to point at the new backing indices
4. By default, deletes the previous backing indices (use `--keep-old-indices` to retain them)

This enables zero-downtime rebuilds where consumers only ever query the stable alias names.
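The atomic swap in step 3 maps onto a single `indices.updateAliases` call. Here is a sketch of the actions body — `buildSwapActions` is an illustrative helper, not the indexer's actual code, and the backing-index names follow the `<alias>-scsi-<id>` example above:

```javascript
// Build the actions for one atomic alias swap. Submitting all four actions
// in a single updateAliases request is what makes the cutover atomic:
// readers never observe an alias pointing at zero or two indices.
function buildSwapActions(alias, oldId, newId) {
  return [
    { remove: { index: `${alias}-scsi-${oldId}`, alias } },
    { add: { index: `${alias}-scsi-${newId}`, alias } },
    { remove: { index: `${alias}-scsi-${oldId}_locations`, alias: `${alias}_locations` } },
    { add: { index: `${alias}-scsi-${newId}_locations`, alias: `${alias}_locations` } },
  ];
}

// With the official client this would be submitted roughly as:
//   await client.indices.updateAliases({
//     actions: buildSwapActions('code-chunks', oldId, newId),
//   });
```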

### Maintenance lock (skip incremental during rebuild)

To avoid concurrent maintenance rebuilds (and to make GitOps-style scheduled jobs safe), `--clean` uses a per-alias lock stored in `<alias>_settings`:

- Document id: `_reindex_lock`
- On startup, a `--clean` run attempts to acquire the lock.
- While locked, non-clean runs will log that maintenance is in progress and **skip** that alias.
- On completion (success or failure), the `--clean` run releases the lock.

### Index Mapping

@@ -103,11 +123,11 @@ Here is the mapping for the `code-chunks` index:
| `containerPath` | `text` | The path of the containing symbol (e.g., class name for a method). |
| `chunk_hash` | `keyword` | A hash of the content of the code chunk. |
| `content` | `text` | The raw source code of the chunk. |
| `semantic_text` | `semantic_text` | Semantic search field populated via Elasticsearch inference at ingest time. Note: it does **not** include file paths/directories; those live in `<index>_locations`. |
| `semantic_text` | `semantic_text` | Semantic search field populated via Elasticsearch inference at ingest time. Note: it does **not** include file paths/directories; those live in `<alias>_locations`. |
| `created_at` | `date` | The timestamp when the document was created. |
| `updated_at` | `date` | The timestamp when the document was last updated. |

### Locations index (`<index>_locations`)
### Locations index (`<alias>_locations`)

To avoid “mega-documents” for boilerplate chunks (license headers, common imports, etc.), the indexer writes **one document per chunk occurrence** into `<alias>_locations`.

@@ -148,7 +168,7 @@ To perform a semantic search, use a `semantic` query against the `semantic_text`
```javascript
async function searchCode(query) {
const response = await client.search({
index: 'code-chunks', // Or process.env.ELASTICSEARCH_INDEX
index: 'code-chunks', // Alias name (or process.env.ELASTICSEARCH_INDEX)
query: {
semantic: {
field: 'semantic_text',
Expand All @@ -168,14 +188,14 @@ async function searchCode(query) {

While the primary focus is on semantic search, you can also perform traditional Elasticsearch queries on the other fields. For example, you can filter chunk docs by `language` or `kind`.

For file-path filtering, query `<index>_locations` by `filePath` and join back to chunk docs using `chunk_id` (via `mget`).
For file-path filtering, query `<alias>_locations` by `filePath` and join back to chunk docs using `chunk_id` (via `mget`).

### Joining chunk docs to file locations (important)

The indexer stores content-deduplicated chunk documents in `<index>` and per-file occurrences in `<index>_locations`:
The indexer stores content-deduplicated chunk documents in `<alias>` (via alias) and per-file occurrences in `<alias>_locations`:

- Query `<index>_locations` to find relevant occurrences (by `filePath`, directory fields, etc.).
- Use the resulting `chunk_id` values to fetch chunk documents from `<index>` (`mget`).
- Query `<alias>_locations` to find relevant occurrences (by `filePath`, directory fields, etc.).
- Use the resulting `chunk_id` values to fetch chunk documents from `<alias>` (`mget`).
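The two-step join above can be sketched with a pure helper that turns a locations search response into an `mget` body for the chunk alias. `buildChunkMget` is an illustrative name; the query in the usage comment is likewise only an example:

```javascript
// Turn a locations search response into the mget request body for the
// chunk alias. Deduplicates chunk_ids, since many location docs can point
// at the same content-deduplicated chunk.
function buildChunkMget(locationsResponse, chunkAlias) {
  const ids = [...new Set(locationsResponse.hits.hits.map((h) => h._source.chunk_id))];
  return { index: chunkAlias, ids };
}

// Usage with the official client (illustrative):
//   const locs = await client.search({
//     index: 'code-chunks_locations',
//     query: { wildcard: { filePath: { value: 'src/utils/*' } } },
//   });
//   const chunks = await client.mget(buildChunkMget(locs, 'code-chunks'));
```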

### Important Considerations

13 changes: 2 additions & 11 deletions src/commands/full_index_producer.ts
@@ -1,11 +1,5 @@
import { glob } from 'glob';
import {
createIndex,
createLocationsIndex,
createSettingsIndex,
deleteIndex,
deleteLocationsIndex,
} from '../utils/elasticsearch';
import { createIndex, createLocationsIndex } from '../utils/elasticsearch';
import { LanguageParser } from '../utils/parser';
import { indexingConfig } from '../config';
import path from 'path';
@@ -67,17 +61,14 @@ export async function index(directory: string, clean: boolean, options: IndexOpt
supportedFileExtensions,
});
if (clean) {
logger.info('Clean flag is set, deleting existing index and clearing queue.');
await deleteIndex(options?.elasticsearchIndex);
await deleteLocationsIndex(options?.elasticsearchIndex);
logger.info('Clean flag is set, clearing queue before full reindex.');

// Clear the queue when doing a clean reindex
const workQueue: IQueueWithEnqueueMetadata = await getQueue(options, repoName, gitBranch);
await workQueue.clear();
}

await createIndex(options?.elasticsearchIndex);
await createSettingsIndex(options?.elasticsearchIndex);
await createLocationsIndex(options?.elasticsearchIndex);

// Use execFileSync to prevent shell injection from special characters in directory paths
4 changes: 3 additions & 1 deletion src/commands/incremental_index_command.ts
Expand Up @@ -20,6 +20,7 @@ import {
export interface IncrementalIndexOptions {
queueDir: string;
elasticsearchIndex?: string;
settingsIndex?: string;
token?: string;
repoName?: string;
branch?: string;
@@ -61,7 +62,8 @@ export async function incrementalIndex(directory: string, options: IncrementalIn
...options,
});

const lastCommitHash = await getLastIndexedCommit(gitBranch, options?.elasticsearchIndex);
const settingsIndex = options?.settingsIndex || options?.elasticsearchIndex;
const lastCommitHash = await getLastIndexedCommit(gitBranch, settingsIndex);

if (!lastCommitHash) {
logger.warn('No previous commit hash found. Please run a full index first.', { gitBranch });