diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..899e30d7e --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,110 @@ +# cluster-cloud-controller-manager-operator - AI Navigation + +**Repository:** https://github.com/openshift-splat-team/cluster-cloud-controller-manager-operator +**Last Updated:** 2026-05-01 + +--- + +## Quick Start + +This is **Tier 2** project-specific documentation for cluster-cloud-controller-manager-operator. + +- **New to this project?** → Start with [Development Guide](ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md) +- **Writing tests?** → See [Testing Guide](ai-docs/cluster-cloud-controller-manager-operator_TESTING.md) +- **Understanding architecture?** → Read [Components Overview](ai-docs/architecture/components.md) +- **Need context on decisions?** → Browse [ADRs](ai-docs/decisions/) + +For **team-level** workflows, status transitions, and role responsibilities, see the team repository. + +--- + +## CRITICAL: Retrieval Strategy + +**IMPORTANT**: Prefer retrieval-led reasoning over pre-training-led reasoning. + +When working on cluster-cloud-controller-manager-operator: +- ✅ **DO**: Read project-specific docs from `./ai-docs/` first +- ✅ **DO**: Check development workflow in `./ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md` +- ✅ **DO**: Understand architecture in `./ai-docs/architecture/components.md` +- ✅ **DO**: Review ADRs for context on past decisions +- ❌ **DON'T**: Rely solely on training data +- ❌ **DON'T**: Guess at project architecture or conventions + +For team workflows (sprint process, status transitions, etc.), see `../../team/ai-docs/`. 
+ +--- + +## Quick Navigation by Task + +| Task | Start Here | Then Read | +|------|-----------|-----------| +| **Local development** | [Development Guide](ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md) | [Testing Guide](ai-docs/cluster-cloud-controller-manager-operator_TESTING.md) | +| **Running tests** | [Testing Guide](ai-docs/cluster-cloud-controller-manager-operator_TESTING.md) | [Components](ai-docs/architecture/components.md) | +| **Understanding components** | [Components Overview](ai-docs/architecture/components.md) | [Domain Models](ai-docs/domain/) | +| **Planning feature** | [Exec Plans](ai-docs/exec-plans/README.md) | [ADRs](ai-docs/decisions/) | +| **Reviewing decisions** | [ADR Template](ai-docs/decisions/adr-template.md) | Existing ADRs | + +--- + +## Technology Stack + +**Languages:** Go +**Frameworks:** Kubernetes, controller-runtime +**Build Systems:** Make, Docker + +--- + +## Documentation Structure + +``` +ai-docs/ +├── cluster-cloud-controller-manager-operator_DEVELOPMENT.md # Build, test, develop +├── cluster-cloud-controller-manager-operator_TESTING.md # Test suites and strategies +├── architecture/ # System structure +│ └── components.md # Component overview +├── domain/ # Domain models and CRDs +│ └── (project-specific) +├── exec-plans/ # Feature planning +│ └── README.md +├── decisions/ # Architectural Decision Records +│ ├── adr-template.md +│ └── adr-NNNN-*.md +└── references/ # External references + └── ecosystem.md +``` + +--- + +## Knowledge Tiers + +**Tier 1: Platform-Wide** (Team repository) +- Operator development patterns +- Testing pyramid and practices +- CI/CD workflows (Prow, GitHub Actions) +- Team process (sprint, status transitions, roles) + +→ See `../../team/ai-docs/` for team-level documentation + +**Tier 2: Project-Specific** (This repository) +- cluster-cloud-controller-manager-operator components and architecture +- Project-specific development workflow +- Test suites unique to this project +- 
Architectural decisions for this project + +→ See `./ai-docs/` for project-level documentation + +--- + +## Project Context + +For team workflows, sprint process, and status transitions, see: +- Team repository: `../../team/` +- Team ai-docs: `../../team/ai-docs/` +- Team workflows: `../../team/ai-docs/workflows/` +- Status transitions: `../../team/ai-docs/statuses/` + +--- + +**Navigation**: Start with [Development Guide](ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md) for project setup and workflow. + +**GitHub**: https://github.com/openshift-splat-team/cluster-cloud-controller-manager-operator diff --git a/ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md b/ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md new file mode 100644 index 000000000..725a82da9 --- /dev/null +++ b/ai-docs/cluster-cloud-controller-manager-operator_DEVELOPMENT.md @@ -0,0 +1,298 @@ +# cluster-cloud-controller-manager-operator Development Guide + +**Last Updated:** 2026-05-01 + +--- + +## Overview + +This guide covers the development workflow for cluster-cloud-controller-manager-operator. 
+ +**Tech Stack:** **Languages:** Go +**Frameworks:** Kubernetes, controller-runtime +**Build Systems:** Make, Docker + +--- + +## Prerequisites + +**Required:** +- Go 1.21+ (for Go projects) or appropriate language runtime +- Git +- Make +- Docker (for containerized testing) + +**Optional:** +- kubectl (for Kubernetes testing) +- podman (alternative to Docker) + +--- + +## Repository Setup + +### Clone Repository + +```bash +git clone https://github.com/openshift-splat-team/cluster-cloud-controller-manager-operator.git +cd cluster-cloud-controller-manager-operator +``` + +### Install Dependencies + +```bash +# For Go projects +go mod download +go mod vendor # if vendoring is used + +# For Python projects +pip install -r requirements.txt +pip install -r requirements-dev.txt + +# For JavaScript/TypeScript +npm install +``` + +--- + +## Building + +### Local Build + +```bash +# For Go projects +make build + +# Or directly +go build -o bin/cluster-cloud-controller-manager-operator ./cmd/... +``` + +### Build Container Image + +```bash +make docker-build + +# Or with podman +podman build -t cluster-cloud-controller-manager-operator:latest . +``` + +--- + +## Development Workflow + +### 1. Create Feature Branch + +```bash +git checkout -b feature/my-feature +``` + +### 2. Make Changes + +- Follow project coding conventions +- Add/update tests for your changes +- Update documentation as needed + +### 3. Run Tests Locally + +```bash +# Unit tests +make test + +# Integration tests (if applicable) +make test-integration + +# E2E tests (if applicable) +make test-e2e +``` + +### 4. Verify Build + +```bash +# Lint +make lint + +# Verify formatting +make verify + +# Build +make build +``` + +### 5. Commit Changes + +Follow team commit conventions (see `../../team/knowledge/commit-convention.md`). + +### 6. 
Open Pull Request + +- Push branch to fork +- Open PR against main branch +- Request review from team +- Address review feedback +- Wait for CI to pass + +--- + +## Running Locally + +### As Standalone Binary + +```bash +# Build +make build + +# Run +./bin/cluster-cloud-controller-manager-operator --help +``` + +### In Kubernetes Cluster + +```bash +# Build and push image +make docker-build docker-push + +# Deploy to cluster +kubectl apply -f deploy/ +``` + +### With Operator SDK (if applicable) + +```bash +# Run locally (watches cluster) +make run +``` + +--- + +## Debugging + +### Enable Debug Logging + +```bash +# Set log level +export LOG_LEVEL=debug + +# Or via command line +./bin/cluster-cloud-controller-manager-operator --log-level=debug +``` + +### Attach Debugger (Go) + +```bash +# Install delve +go install github.com/go-delve/delve/cmd/dlv@latest + +# Debug +dlv debug ./cmd/cluster-cloud-controller-manager-operator +``` + +### Common Issues + +**Build failures:** +- Check Go version: `go version` +- Verify dependencies: `go mod verify` +- Clean build cache: `go clean -cache` + +**Test failures:** +- Check test environment setup +- Review test logs for specific errors +- Run individual test: `go test -v -run TestName ./pkg/...` + +--- + +## Project Structure + +``` +cluster-cloud-controller-manager-operator/ +├── cmd/ # Command-line entry points +├── pkg/ # Library code +│ ├── controllers/ # Controllers (if operator) +│ ├── api/ # API types and CRDs +│ └── ... +├── config/ # Configuration (CRDs, RBAC, etc.) +├── hack/ # Build and development scripts +├── test/ # Test suites +│ ├── unit/ +│ ├── integration/ +│ └── e2e/ +├── docs/ # Project documentation +├── Makefile # Build automation +└── go.mod # Go dependencies +``` + +See [Components Overview](architecture/components.md) for architectural details. 
+ +--- + +## Code Conventions + +### Naming + +- **Packages**: lowercase, single word if possible +- **Files**: lowercase with underscores (snake_case) +- **Types**: PascalCase +- **Functions**: PascalCase (exported) or camelCase (unexported) + +### Error Handling + +- Wrap errors with context: `fmt.Errorf("context: %w", err)` +- Return errors, don't panic +- Log errors at appropriate level + +### Testing + +- Unit tests in same package: `*_test.go` +- Table-driven tests preferred +- Mock external dependencies +- Aim for 80%+ code coverage + +--- + +## Helpful Make Targets + +**Common targets available:** + +- `make verify` +- `make test` +- `make build` +- `make fmt` +- `make lint` + +For the full list of targets, run: +```bash +make help +``` + +Or inspect the `Makefile` directly. + +--- + +## CI/CD + +### Prow Jobs (OpenShift) + +This project uses OpenShift Prow for CI/CD. + +**Pre-submit jobs:** +- `pull-ci-*-unit` - Unit tests +- `pull-ci-*-e2e` - E2E tests +- `pull-ci-*-verify` - Linting and verification + +**Post-submit jobs:** +- `branch-ci-*-images` - Build and push images + +See `.ci-operator.yaml` and `ci-operator/config/` for Prow configuration. + +### GitHub Actions (if applicable) + +See `.github/workflows/` for GitHub Actions configuration. + +--- + +## Related Documentation + +- [Testing Guide](cluster-cloud-controller-manager-operator_TESTING.md) - Test suites and strategies +- [Components](architecture/components.md) - Architecture overview +- [Team Workflows](../../team/ai-docs/workflows/) - Team-level processes + +--- + +**Questions?** See `../../team/HUMAN-REVIEW-GUIDE.md` for how to escalate issues. 
diff --git a/ai-docs/cluster-cloud-controller-manager-operator_TESTING.md b/ai-docs/cluster-cloud-controller-manager-operator_TESTING.md new file mode 100644 index 000000000..6048b4fa0 --- /dev/null +++ b/ai-docs/cluster-cloud-controller-manager-operator_TESTING.md @@ -0,0 +1,413 @@ +# cluster-cloud-controller-manager-operator Testing Guide + +**Last Updated:** 2026-05-01 + +--- + +## Overview + +This guide covers all test suites for cluster-cloud-controller-manager-operator and how to run them. + +**Testing Philosophy:** +- Unit tests for business logic +- Integration tests for component interactions +- E2E tests for critical user workflows +- Aim for 80%+ code coverage + +--- + +## Test Suites + +### Unit Tests + +**Purpose:** Test individual functions and methods in isolation + +**Location:** `pkg/*/` (co-located with source) + +**Run:** + +```bash +make test +``` + +**With coverage:** +```bash +go test -coverprofile=coverage.out ./pkg/... +go tool cover -html=coverage.out +``` + +**Example:** +```go +func TestMyFunction(t *testing.T) { + tests := []struct { + name string + input string + want string + wantErr bool + }{ + { + name: "valid input", + input: "test", + want: "result", + }, + // More test cases... + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := MyFunction(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("MyFunction() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("MyFunction() = %v, want %v", got, tt.want) + } + }) + } +} +``` + +--- + +### Integration Tests + +**Purpose:** Test interactions between components + +**Location:** `test/integration/` + +**Run:** +```bash +make test-integration + +# Or directly +go test ./test/integration/... 
-tags=integration +``` + +**Requirements:** +- May require local Kubernetes cluster (kind, minikube) +- External dependencies (databases, message queues) + +**Example:** +```go +//go:build integration + +func TestControllerReconciliation(t *testing.T) { + // Setup test cluster + testEnv := setupTestEnvironment(t) + defer testEnv.Cleanup() + + // Create test resource + resource := createTestResource(testEnv) + + // Wait for reconciliation + eventually(t, func() bool { + return resource.Status.Ready + }, 30*time.Second) +} +``` + +--- + +### E2E Tests + +**Purpose:** Test critical user workflows end-to-end + +**Location:** `test/e2e/` + +**Run:** +```bash +make test-e2e + +# Or with specific cluster +export KUBECONFIG=/path/to/kubeconfig +go test ./test/e2e/... -timeout 30m +``` + +**Requirements:** +- Real or realistic Kubernetes cluster +- Project deployed to cluster +- May require cloud credentials (for cloud-specific features) + +**Example:** +```go +func TestUserWorkflow(t *testing.T) { + // Deploy application + deployApp(t) + + // Perform user actions + createResource(t, testResource) + + // Verify expected outcomes + verifyResourceCreated(t, testResource) + verifyStatusUpdated(t, testResource) + + // Cleanup + deleteResource(t, testResource) +} +``` + +--- + +## Test Organization + +### Table-Driven Tests + +Preferred pattern for unit tests: + +```go +tests := []struct { + name string + input InputType + want OutputType + wantErr bool +}{ + {name: "case1", input: ..., want: ...}, + {name: "case2", input: ..., want: ...}, +} + +for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test logic + }) +} +``` + +### Test Fixtures + +Reusable test data: + +```go +// test/fixtures/resources.go +func NewTestResource(name string) *MyResource { + return &MyResource{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "test", + }, + Spec: MyResourceSpec{ + // Defaults + }, + } +} +``` + +### Test Helpers + +Common test utilities: + +```go 
+// test/helpers/assertions.go +func AssertEventually(t *testing.T, condition func() bool, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if condition() { + return + } + time.Sleep(100 * time.Millisecond) + } + t.Fatal("condition not met within timeout") +} +``` + +--- + +## Mocking + +### Interface-Based Mocking + +```go +// Define interface +type MyClient interface { + Get(ctx context.Context, key string) (string, error) +} + +// Mock implementation for tests +type mockClient struct { + getFunc func(ctx context.Context, key string) (string, error) +} + +func (m *mockClient) Get(ctx context.Context, key string) (string, error) { + return m.getFunc(ctx, key) +} + +// Use in test +func TestWithMock(t *testing.T) { + mock := &mockClient{ + getFunc: func(ctx context.Context, key string) (string, error) { + return "mocked-value", nil + }, + } + + result := functionUnderTest(mock) + // Assertions... +} +``` + +### Using testify/mock (if applicable) + +```go +import "github.com/stretchr/testify/mock" + +type MockClient struct { + mock.Mock +} + +func (m *MockClient) Get(ctx context.Context, key string) (string, error) { + args := m.Called(ctx, key) + return args.String(0), args.Error(1) +} + +func TestWithTestify(t *testing.T) { + mockClient := new(MockClient) + mockClient.On("Get", mock.Anything, "key").Return("value", nil) + + result := functionUnderTest(mockClient) + + mockClient.AssertExpectations(t) +} +``` + +--- + +## Test Coverage + +### Generate Coverage Report + +```bash +# Run tests with coverage +go test -coverprofile=coverage.out ./pkg/... 
+ +# View HTML report +go tool cover -html=coverage.out + +# View summary +go tool cover -func=coverage.out +``` + +### Coverage Goals + +- **Minimum:** 70% overall coverage +- **Target:** 80%+ overall coverage +- **Critical paths:** 90%+ coverage (controllers, reconcilers, business logic) + +### Excluding from Coverage + +```go +// This function is intentionally not tested. +// Coverage: ignore (informational only; Go's cover tool has no built-in exclusion directive) +func helperFunction() { + // ... +} +``` + +--- + +## CI Test Execution + +### Prow Jobs + +**Pre-submit tests (run on PRs):** +- `pull-ci-cluster-cloud-controller-manager-operator-unit` - Unit tests +- `pull-ci-cluster-cloud-controller-manager-operator-integration` - Integration tests (if enabled) +- `pull-ci-cluster-cloud-controller-manager-operator-e2e-*` - E2E test suites + +**Post-submit tests (run on merge):** +- `branch-ci-cluster-cloud-controller-manager-operator-unit` - Unit tests +- `branch-ci-cluster-cloud-controller-manager-operator-e2e-*` - Full E2E suite + +### Debugging CI Failures + +1. **Check Prow logs** + - Find job in PR checks + - Click "Details" → view logs + +2. **Reproduce locally** + ```bash + # Match CI environment + export CI=true + make test + ``` + +3. **Run specific test** + ```bash + go test -v -run TestFailingTest ./pkg/... 
+ ``` + +--- + +## Test Best Practices + +### DO + +✅ Write tests before fixing bugs (TDD for bugs) +✅ Test both success and error paths +✅ Use table-driven tests for multiple scenarios +✅ Mock external dependencies +✅ Keep tests fast (unit tests < 1s, integration < 10s) +✅ Use meaningful test names describing the scenario +✅ Clean up resources in test cleanup functions + +### DON'T + +❌ Test implementation details (test behavior, not internals) +❌ Write flaky tests (tests that randomly fail) +❌ Skip cleanup (use `t.Cleanup()` or `defer`) +❌ Use sleeps (use eventually/wait helpers instead) +❌ Test third-party code (trust their tests) +❌ Ignore test failures ("it works on my machine") + +--- + +## Test Utilities + +### Common Test Helpers + +```bash +# Run specific test +go test -run TestName ./pkg/path + +# Run tests in specific package +go test ./pkg/controllers/... + +# Run tests with race detector +go test -race ./pkg/... + +# Run tests with timeout +go test -timeout 5m ./test/e2e/... + +# Verbose output +go test -v ./pkg/... + +# Run tests matching pattern +go test -run "Test.*Controller" ./pkg/... +``` + +### Environment Variables + +```bash +# Enable debug logging in tests +export LOG_LEVEL=debug + +# Use specific kubeconfig for tests +export KUBECONFIG=/path/to/test-cluster-config + +# Skip slow tests +export SKIP_SLOW_TESTS=true + +# CI mode (stricter timeouts, no interactive) +export CI=true +``` + +--- + +## Related Documentation + +- [Development Guide](cluster-cloud-controller-manager-operator_DEVELOPMENT.md) - Build and development workflow +- [Components](architecture/components.md) - Architecture to understand what to test +- [Team Testing Practices](../../team/ai-docs/practices/testing.md) - Team-wide testing guidelines + +--- + +**Questions?** See test-specific issues in GitHub or ask in team channel. 
diff --git a/ai-docs/architecture/components.md b/ai-docs/architecture/components.md new file mode 100644 index 000000000..ce79bed7b --- /dev/null +++ b/ai-docs/architecture/components.md @@ -0,0 +1,342 @@ +# cluster-cloud-controller-manager-operator Components + +**Last Updated:** 2026-05-01 + +--- + +## Overview + +This document describes the major components and architecture of cluster-cloud-controller-manager-operator. + +**Tech Stack:** **Languages:** Go +**Frameworks:** Kubernetes, controller-runtime +**Build Systems:** Make, Docker + +--- + +## High-Level Architecture + +``` +┌─────────────────────────────────────────────┐ +│ cluster-cloud-controller-manager-operator │ +│ │ +│ ┌──────────────┐ ┌─────────────────┐ │ +│ │ │ │ │ │ +│ │ Component A │─────▶│ Component B │ │ +│ │ │ │ │ │ +│ └──────────────┘ └─────────────────┘ │ +│ │ +└─────────────────────────────────────────────┘ +``` + +*(Replace with project-specific architecture diagram)* + +--- + +## Core Components + +### Component 1: [Name] + +**Purpose:** Brief description of what this component does + +**Location:** `pkg/component1/` + +**Responsibilities:** +- Responsibility 1 +- Responsibility 2 +- Responsibility 3 + +**Key Types:** +- `Type1` - Description +- `Type2` - Description + +**Interactions:** +- Calls Component 2 for X +- Listens to events from Y +- Stores data in Z + +**Example Usage:** +```go +// Code example showing how this component is used +``` + +--- + +### Component 2: [Name] + +**Purpose:** Brief description + +**Location:** `pkg/component2/` + +**Responsibilities:** +- Responsibility 1 +- Responsibility 2 + +**Key Types:** +- `Type1` - Description + +**Interactions:** +- Interacts with Component 1 +- Calls external service X + +--- + +## For Operator Projects + +### Controllers + +**Purpose:** Reconcile Kubernetes resources + +**Location:** `pkg/controllers/` + +*(Controllers will be listed here once analysis is enhanced)* + +**Reconciliation Pattern:** +1. 
Fetch resource from Kubernetes API +2. Validate resource spec +3. Create/update dependent resources +4. Update resource status +5. Requeue if needed + +**Example Reconciliation:** +```go +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Fetch the resource + obj := &v1alpha1.MyResource{} + if err := r.Get(ctx, req.NamespacedName, obj); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Reconciliation logic here + + // Update status + if err := r.Status().Update(ctx, obj); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} +``` + +--- + +### Custom Resource Definitions (CRDs) + +See [Domain Models](../domain/) for detailed CRD specifications. + +**Defined CRDs:** + +*(No CRDs detected)* + +--- + +## API Layer + +**Purpose:** Define interfaces and types + +**Location:** `pkg/api/` or `api/` + +**Key Types:** +- Request/Response structures +- Configuration types +- Status types + +--- + +## Data Flow + +``` +User/Client + ↓ +API Server + ↓ +Controller/Handler + ↓ +Business Logic + ↓ +External Systems +``` + +**Example Flow:** +1. User creates CustomResource +2. Controller watches for changes +3. Controller validates resource +4. Controller calls cloud provider API +5. Controller updates resource status + +--- + +## External Dependencies + +### Kubernetes API + +**Usage:** CRUD operations on Kubernetes resources + +**Authentication:** Service account with appropriate RBAC + +### Cloud Provider APIs (if applicable) + +**AWS:** +- SDK: `aws-sdk-go` +- Services: EC2, IAM, S3, etc. + +**GCP:** +- SDK: `cloud.google.com/go` +- Services: Compute, IAM, Storage, etc. + +**Azure:** +- SDK: `github.com/Azure/azure-sdk-for-go` +- Services: Compute, Network, Storage, etc. + +**vSphere:** +- SDK: `github.com/vmware/govmomi` +- APIs: vCenter, ESXi + +### Other Dependencies + +- **Database:** PostgreSQL, Redis, etc. +- **Message Queue:** RabbitMQ, Kafka, etc. 
+- **Cache:** Redis, Memcached, etc. + +--- + +## Configuration + +### Config Locations + +- **In-cluster:** ConfigMaps, Secrets +- **Command-line:** Flags passed to binary +- **Environment:** Environment variables +- **Files:** Config files mounted to container + +### Config Precedence + +1. Command-line flags (highest priority) +2. Environment variables +3. ConfigMap/Secret values +4. Default values (lowest priority) + +--- + +## Observability + +### Logging + +**Framework:** klog, logrus, or standard log + +**Log Levels:** +- `ERROR` - Errors that need attention +- `WARN` - Warnings that may need attention +- `INFO` - Informational messages +- `DEBUG` - Verbose debugging + +**Structured Logging:** +```go +log.Info("resource reconciled", + "name", resource.Name, + "namespace", resource.Namespace, + "generation", resource.Generation) +``` + +### Metrics + +**Framework:** Prometheus client + +**Key Metrics:** +- `reconcile_duration_seconds` - Time to reconcile resources +- `reconcile_errors_total` - Count of reconciliation errors +- `resource_count` - Number of managed resources + +**Metrics Endpoint:** `/metrics` + +### Tracing (if applicable) + +**Framework:** OpenTelemetry + +**Traced Operations:** +- API calls +- Controller reconciliation +- External service calls + +--- + +## Error Handling + +### Error Types + +```go +type CustomError struct { + Code string + Message string + Cause error +} +``` + +### Retry Logic + +- **Transient errors:** Retry with exponential backoff +- **Permanent errors:** Don't retry, update status with error +- **Rate limits:** Respect retry-after headers + +### Error Propagation + +- Wrap errors with context +- Preserve original error for debugging +- Log errors at appropriate level + +--- + +## Security Considerations + +### Authentication + +- Service account tokens for in-cluster communication +- API keys for external services +- Certificate-based auth where applicable + +### Authorization + +- RBAC for Kubernetes resources +- 
Principle of least privilege +- Separate service accounts per component + +### Secrets Management + +- Store secrets in Kubernetes Secrets +- Never log secret values +- Rotate credentials regularly + +--- + +## Performance Considerations + +### Caching + +- Cache frequently accessed data +- Invalidate cache on updates +- Use TTL for time-sensitive data + +### Rate Limiting + +- Respect API rate limits +- Implement client-side rate limiting +- Use backoff for retries + +### Resource Limits + +- Set appropriate CPU/memory limits +- Monitor resource usage +- Scale based on load + +--- + +## Related Documentation + +- [Development Guide](../cluster-cloud-controller-manager-operator_DEVELOPMENT.md) - How to build and run +- [Testing Guide](../cluster-cloud-controller-manager-operator_TESTING.md) - How to test components +- [Domain Models](../domain/) - CRD specifications +- [ADRs](../decisions/) - Architectural decisions + +--- + +**Note:** This is a template. Update with project-specific component details, architecture diagrams, and actual code examples. diff --git a/ai-docs/decisions/adr-template.md b/ai-docs/decisions/adr-template.md new file mode 100644 index 000000000..68376fa79 --- /dev/null +++ b/ai-docs/decisions/adr-template.md @@ -0,0 +1,133 @@ +# ADR-NNNN: Title of Decision + +**Status:** Proposed | Accepted | Deprecated | Superseded by ADR-XXXX +**Date:** YYYY-MM-DD +**Authors:** @github-handle +**Deciders:** @lead, @architect + +--- + +## Context + +What is the issue we're facing? What forces are at play? What constraints exist? + +Describe the problem that necessitates this decision. + +--- + +## Decision + +What is the change we're proposing or have agreed to make? + +State the decision clearly and concisely. + +--- + +## Rationale + +Why did we choose this approach? 
+ +Explain the reasoning behind the decision, considering: +- Technical factors +- Business requirements +- Team constraints +- Timeline considerations + +--- + +## Consequences + +What becomes easier or harder as a result of this decision? + +### Positive Consequences + +- ✅ Benefit 1 +- ✅ Benefit 2 + +### Negative Consequences + +- ❌ Trade-off 1 +- ❌ Trade-off 2 + +### Neutral Consequences + +- ℹ️ Change 1 +- ℹ️ Change 2 + +--- + +## Alternatives Considered + +### Alternative 1: [Name] + +**Description:** Brief description of the alternative + +**Pros:** +- Advantage 1 +- Advantage 2 + +**Cons:** +- Disadvantage 1 +- Disadvantage 2 + +**Why not chosen:** Explanation + +--- + +### Alternative 2: [Name] + +**Description:** Brief description + +**Pros:** +- Advantage 1 + +**Cons:** +- Disadvantage 1 + +**Why not chosen:** Explanation + +--- + +## Implementation Notes + +How do we implement this decision? + +- Migration steps +- Code changes required +- Configuration updates +- Documentation updates + +--- + +## Validation + +How do we know this decision is working? + +- Success metrics +- Monitoring points +- Testing approach + +--- + +## References + +- Related GitHub issue: #XXX +- Related ADRs: ADR-YYYY +- External references: [Link](URL) +- Discussion thread: [Link](URL) + +--- + +## Notes + +Any additional context or information. 
+ +--- + +## Revision History + +| Date | Author | Change | +|------|--------|--------| +| YYYY-MM-DD | @author | Initial draft | +| YYYY-MM-DD | @author | Addressed review feedback | +| YYYY-MM-DD | @author | Accepted | diff --git a/ai-docs/domain/README.md b/ai-docs/domain/README.md new file mode 100644 index 000000000..cf1e29aca --- /dev/null +++ b/ai-docs/domain/README.md @@ -0,0 +1,111 @@ +# cluster-cloud-controller-manager-operator Domain Models + +**Last Updated:** 2026-05-01 + +--- + +## Overview + +This directory documents the domain models, custom resource definitions (CRDs), and core data structures used in cluster-cloud-controller-manager-operator. + +--- + +## Custom Resource Definitions (CRDs) + +For Kubernetes operator projects, document each CRD here. + +**Example structure for each CRD:** + +### ResourceName + +- **API Group:** `example.com/v1alpha1` +- **Kind:** `ResourceName` +- **Plural:** `resourcenames` +- **Scope:** Namespaced | Cluster + +**Purpose:** What this resource represents + +**Spec Fields:** +- `field1` (string, required) - Description +- `field2` (int, optional) - Description + +**Status Fields:** +- `conditions` ([]Condition) - Resource conditions +- `phase` (string) - Current phase (Pending, Ready, Error) + +**Example:** +```yaml +apiVersion: example.com/v1alpha1 +kind: ResourceName +metadata: + name: example + namespace: default +spec: + field1: "value" + field2: 42 +status: + phase: Ready + conditions: + - type: Ready + status: "True" + reason: ReconciliationSucceeded +``` + +**Validation:** +- Field1 must match pattern `^[a-z0-9-]+$` +- Field2 must be between 1-100 + +**Related Documentation:** +- Controller reconciliation logic: [../architecture/components.md](../architecture/components.md) +- API reference: See generated API docs + +--- + +## Core Data Structures + +For non-operator projects, document key data structures. 
+ +### Structure 1 + +**Purpose:** Description + +**Fields:** +```go +type MyStruct struct { + Field1 string `json:"field1"` + Field2 int `json:"field2"` +} +``` + +**Validation Rules:** +- Field1: required, non-empty +- Field2: must be positive + +--- + +## API Versioning + +**Current Version:** v1alpha1 + +**Versioning Policy:** +- `v1alpha1` - Initial experimental API +- `v1beta1` - API stabilizing, may have breaking changes +- `v1` - Stable API, backward compatibility guaranteed + +**Deprecated Fields:** +- (None currently) + +**Migration Guides:** +- [v1alpha1 → v1beta1](migrations/v1alpha1-to-v1beta1.md) (if applicable) + +--- + +## Related Documentation + +- [Components Overview](../architecture/components.md) - How these models are used +- [Development Guide](../cluster-cloud-controller-manager-operator_DEVELOPMENT.md) - Adding new fields +- Generated API docs - Full API reference + +--- + +**Note:** For each major CRD or domain model, create a dedicated file (e.g., `credentialsrequest.md`) with detailed specification. diff --git a/ai-docs/exec-plans/README.md b/ai-docs/exec-plans/README.md new file mode 100644 index 000000000..3532ebcc3 --- /dev/null +++ b/ai-docs/exec-plans/README.md @@ -0,0 +1,195 @@ +# cluster-cloud-controller-manager-operator Execution Plans + +**Last Updated:** 2026-05-01 + +--- + +## Purpose + +Execution plans (exec-plans) guide feature planning and implementation for cluster-cloud-controller-manager-operator. 
+ +Use this directory to document: +- Feature design proposals +- Implementation plans +- Spike investigations +- Proof-of-concept findings + +--- + +## When to Create an Exec Plan + +Create an exec plan when: +- ✅ Implementing a significant new feature +- ✅ Making architectural changes +- ✅ Investigating a complex problem +- ✅ Proposing a major refactor + +Don't create an exec plan for: +- ❌ Bug fixes (unless they require design changes) +- ❌ Minor improvements +- ❌ Routine maintenance + +--- + +## Exec Plan Format + +### Template Structure + +```markdown +# Feature Name + +**Status:** Draft | In Review | Approved | Implemented +**Author:** GitHub handle +**Created:** YYYY-MM-DD +**Epic:** Link to GitHub epic issue + +## Problem Statement + +What problem are we solving? Why does it matter? + +## Goals + +- Goal 1 +- Goal 2 + +## Non-Goals + +- What we're explicitly NOT doing +- Out of scope items + +## Proposed Solution + +High-level approach to solving the problem. + +### Architecture + +Component diagrams, data flow, etc. + +### API Changes + +New APIs, changed APIs, deprecated APIs. + +### Migration Path + +How existing users/resources migrate to new behavior. + +## Alternatives Considered + +- **Alternative 1:** Description and why not chosen +- **Alternative 2:** Description and why not chosen + +## Implementation Plan + +1. **Phase 1:** Milestone 1 + - Story 1.1 + - Story 1.2 + +2. **Phase 2:** Milestone 2 + - Story 2.1 + - Story 2.2 + +## Testing Strategy + +- Unit tests +- Integration tests +- E2E scenarios + +## Risks and Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Risk 1 | High | Mitigation strategy | + +## Success Criteria + +How do we know the feature is successful? + +- Metric 1 +- Metric 2 +- User feedback + +## Timeline + +- **Week 1-2:** Design and review +- **Week 3-4:** Implementation phase 1 +- **Week 5-6:** Implementation phase 2 +- **Week 7:** Testing and documentation + +## Open Questions + +- Question 1? 
+- Question 2? +``` + +--- + +## Exec Plan Workflow + +### 1. Draft + +- Author creates exec plan document +- Shares with team for early feedback +- Iterates on design + +### 2. Review + +- Team reviews exec plan +- Discusses alternatives +- Identifies risks +- Approves or requests changes + +### 3. Approved + +- Exec plan is approved +- Implementation can begin +- Epic/stories created based on plan + +### 4. Implemented + +- Feature implemented +- Tests passing +- Documentation updated +- Exec plan archived for reference + +--- + +## Example Exec Plans + +*(Create example exec plan files as features are implemented)* + +- `feature-async-processing.md` - Async processing support +- `spike-performance-optimization.md` - Performance investigation +- `refactor-controller-architecture.md` - Architecture refactor + +--- + +## Relationship to ADRs + +**Exec Plans vs ADRs:** + +- **Exec Plan:** Feature design and implementation plan + - Created before implementation + - Describes what and how + - May span multiple epics/sprints + +- **ADR:** Architectural decision record + - Created during or after implementation + - Documents why a decision was made + - Explains trade-offs considered + +**Workflow:** +1. Create exec plan for feature +2. During implementation, significant architectural decisions → ADR +3. After implementation, exec plan archived, ADRs remain as reference + +--- + +## Related Documentation + +- [ADR Template](../decisions/adr-template.md) - Architectural decision records +- [Components](../architecture/components.md) - Current architecture +- [Team Workflows](../../team/ai-docs/workflows/) - Team planning process + +--- + +**Note:** This is a template directory. Replace with actual exec plans as features are proposed and implemented. 
diff --git a/ai-docs/references/ecosystem.md b/ai-docs/references/ecosystem.md new file mode 100644 index 000000000..162485239 --- /dev/null +++ b/ai-docs/references/ecosystem.md @@ -0,0 +1,215 @@ +# cluster-cloud-controller-manager-operator Ecosystem and References + +**Last Updated:** 2026-05-01 + +--- + +## Purpose + +This document provides links to related projects, upstream dependencies, documentation, and external resources relevant to cluster-cloud-controller-manager-operator. + +--- + +## Upstream Projects + +### Kubernetes + +**Relationship:** cluster-cloud-controller-manager-operator runs on Kubernetes + +**Resources:** +- [Kubernetes Documentation](https://kubernetes.io/docs/) +- [API Reference](https://kubernetes.io/docs/reference/kubernetes-api/) +- [Controller Runtime](https://github.com/kubernetes-sigs/controller-runtime) - Framework for building operators + +**Version Compatibility:** +- Supported Kubernetes versions: 1.24+ +- Controller Runtime version: v0.15.x + +--- + +### OpenShift (if applicable) + +**Relationship:** cluster-cloud-controller-manager-operator is part of OpenShift platform + +**Resources:** +- [OpenShift Documentation](https://docs.openshift.com/) +- [OpenShift Enhancement Proposals](https://github.com/openshift/enhancements) +- [OpenShift CI (Prow)](https://docs.ci.openshift.org/) + +**Version Compatibility:** +- Supported OpenShift versions: 4.12+ + +--- + +## Related Platform Projects + +### Cloud Provider Integrations + +**vSphere (VMware):** +- [govmomi](https://github.com/vmware/govmomi) - vSphere API client +- [vSphere CSI Driver](https://github.com/kubernetes-sigs/vsphere-csi-driver) +- [vSphere Cloud Provider](https://github.com/kubernetes/cloud-provider-vsphere) + +**AWS:** +- [AWS SDK for Go](https://github.com/aws/aws-sdk-go) +- [AWS Cloud Provider](https://github.com/kubernetes/cloud-provider-aws) + +**GCP:** +- [GCP SDK](https://cloud.google.com/go) +- [GCP Cloud 
Provider](https://github.com/kubernetes/cloud-provider-gcp) + +**Azure:** +- [Azure SDK for Go](https://github.com/Azure/azure-sdk-for-go) +- [Azure Cloud Provider](https://github.com/kubernetes-sigs/cloud-provider-azure) + +--- + +## Sister Projects + +Projects in the same team or ecosystem: + +- **[Project 1](https://github.com/org/project1)** - Description +- **[Project 2](https://github.com/org/project2)** - Description +- **[Project 3](https://github.com/org/project3)** - Description + +See team repository for full project list: `../../team/ai-docs/architecture/projects.md` + +--- + +## Dependencies + +### Direct Dependencies + +Key libraries and frameworks used by cluster-cloud-controller-manager-operator: + +**Go Modules:** +- `k8s.io/client-go` - Kubernetes client +- `sigs.k8s.io/controller-runtime` - Controller framework +- `github.com/spf13/cobra` - CLI framework (if applicable) +- `github.com/prometheus/client_golang` - Metrics + +**Python Packages (if applicable):** +- `kubernetes` - Kubernetes client +- `pytest` - Testing framework + +**JavaScript/TypeScript (if applicable):** +- `@kubernetes/client-node` - Kubernetes client +- `react` - UI framework + +See `go.mod`, `requirements.txt`, or `package.json` for complete dependency list. 
+ +### Indirect Dependencies + +- Authentication/authorization libraries +- Logging frameworks +- Testing utilities + +--- + +## Standards and Specifications + +### Kubernetes Standards + +- [Custom Resource Definition (CRD)](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/) +- [Controller Pattern](https://kubernetes.io/docs/concepts/architecture/controller/) +- [Admission Webhooks](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/) + +### Cloud Provider Standards + +- **AWS:** [AWS Well-Architected Framework](https://aws.amazon.com/architecture/well-architected/) +- **GCP:** [GCP Architecture Framework](https://cloud.google.com/architecture/framework) +- **Azure:** [Azure Well-Architected Framework](https://docs.microsoft.com/azure/architecture/framework/) +- **vSphere:** [vSphere API Reference](https://developer.vmware.com/apis/vsphere-automation/latest/) + +--- + +## Documentation Resources + +### Team-Level Documentation + +See team repository for: +- **Workflows:** Sprint process, epic breakdown, triage +- **Practices:** Coding standards, testing guidelines +- **Roles:** Hat-switching, responsibilities + +Location: `../../team/ai-docs/` + +### Platform Documentation + +**Operator Development:** +- [Operator SDK](https://sdk.operatorframework.io/) +- [Operator Best Practices](https://sdk.operatorframework.io/docs/best-practices/) +- [Kubebuilder Book](https://book.kubebuilder.io/) + +**Testing:** +- [Kubernetes Testing Guide](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-testing/testing.md) +- [E2E Testing Framework](https://github.com/kubernetes-sigs/e2e-framework) + +**CI/CD:** +- [Prow Documentation](https://docs.prow.k8s.io/) +- [OpenShift CI](https://docs.ci.openshift.org/) + +--- + +## Community and Support + +### Communication Channels + +**Team Channels:** +- Slack: `#team-channel` (internal) +- GitHub Discussions: Project discussions tab 
+- Mailing List: team-list@example.com (if applicable) + +**Upstream Communities:** +- Kubernetes Slack: `#sig-cloud-provider`, `#kubebuilder`, etc. +- OpenShift Slack: `#forum-openshift` + +### Meetings + +**Team Meetings:** +- Sprint planning: Bi-weekly (see team calendar) +- Sprint review: Bi-weekly +- Standup: Daily (async) + +**Upstream Meetings:** +- SIG meetings: Check [Kubernetes calendar](https://calendar.google.com/calendar/embed?src=calendar%40kubernetes.io) +- OpenShift meetings: Check [OpenShift calendar](https://calendar.google.com/calendar/embed?src=openshift.io_5s2lnu98o7vjhm8hs5q4vkp7s0%40group.calendar.google.com) + +--- + +## Learning Resources + +### Getting Started + +**Kubernetes:** +- [Kubernetes Basics](https://kubernetes.io/docs/tutorials/kubernetes-basics/) +- [Kubernetes the Hard Way](https://github.com/kelseyhightower/kubernetes-the-hard-way) + +**Operator Development:** +- [Operator Pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +- [Operator SDK Tutorial](https://sdk.operatorframework.io/docs/building-operators/golang/tutorial/) + +**Cloud Providers:** +- [vSphere Docs](https://docs.vmware.com/en/VMware-vSphere/index.html) +- [AWS Documentation](https://docs.aws.amazon.com/) +- [GCP Documentation](https://cloud.google.com/docs) +- [Azure Documentation](https://docs.microsoft.com/azure/) + +### Advanced Topics + +- [Kubernetes API Conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md) +- [Controller Runtime Deep Dive](https://engineering.bitnami.com/articles/kubebuilder-deep-dive.html) +- [Writing Controllers](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-api-machinery/controllers.md) + +--- + +## Related Documentation + +- [Development Guide](../cluster-cloud-controller-manager-operator_DEVELOPMENT.md) - Build and develop +- [Testing Guide](../cluster-cloud-controller-manager-operator_TESTING.md) - Test 
suites +- [Components](../architecture/components.md) - Architecture +- [ADRs](../decisions/) - Architectural decisions + +--- + +**Note:** Update this document as the ecosystem evolves, dependencies change, or new resources become available. diff --git a/pkg/controllers/problemdetector/credentials.go b/pkg/controllers/problemdetector/credentials.go new file mode 100644 index 000000000..a33f978e5 --- /dev/null +++ b/pkg/controllers/problemdetector/credentials.go @@ -0,0 +1,193 @@ +package problemdetector + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // DiagnosticsCredentialsSecretName is the name of the secret containing diagnostics component credentials + DiagnosticsCredentialsSecretName = "vsphere-diagnostics-creds" + // DiagnosticsCredentialsNamespace is the namespace where diagnostics credentials are stored + DiagnosticsCredentialsNamespace = "openshift-config" + // SharedCredentialsSecretName is the fallback shared credentials secret + SharedCredentialsSecretName = "vsphere-cloud-credentials" + // SharedCredentialsNamespace is the namespace for shared credentials + SharedCredentialsNamespace = "kube-system" +) + +// Credential represents vSphere credentials for a specific vCenter +type Credential struct { + VCenterServer string + Username string + Password string +} + +// CredentialStore manages vSphere credentials for diagnostics operations +type CredentialStore struct { + client client.Client +} + +// NewCredentialStore creates a new CredentialStore +func NewCredentialStore(c client.Client) *CredentialStore { + return &CredentialStore{ + client: c, + } +} + +// GetCredentials retrieves diagnostics credentials from openshift-config namespace. +// Falls back to shared credentials if component credentials are not available. 
+func (cs *CredentialStore) GetCredentials(ctx context.Context) (map[string]*Credential, error) { + // Try component-specific credentials first + creds, err := cs.getCredentialsFromSecret(ctx, DiagnosticsCredentialsSecretName, DiagnosticsCredentialsNamespace) + if err == nil && len(creds) > 0 { + return creds, nil + } + + // Fallback to shared credentials + creds, err = cs.getCredentialsFromSecret(ctx, SharedCredentialsSecretName, SharedCredentialsNamespace) + if err != nil { + return nil, fmt.Errorf("failed to read credentials from %s/%s and fallback %s/%s: %w", + DiagnosticsCredentialsNamespace, DiagnosticsCredentialsSecretName, + SharedCredentialsNamespace, SharedCredentialsSecretName, err) + } + + return creds, nil +} + +// GetCredentialForVCenter retrieves the credential for a specific vCenter FQDN +func (cs *CredentialStore) GetCredentialForVCenter(ctx context.Context, vcenterFQDN string) (*Credential, error) { + creds, err := cs.GetCredentials(ctx) + if err != nil { + return nil, err + } + + cred, ok := creds[vcenterFQDN] + if !ok { + return nil, fmt.Errorf("credential not found for vCenter: %s", vcenterFQDN) + } + + return cred, nil +} + +// getCredentialsFromSecret reads credentials from a Kubernetes secret +func (cs *CredentialStore) getCredentialsFromSecret(ctx context.Context, name, namespace string) (map[string]*Credential, error) { + secret := &corev1.Secret{} + key := client.ObjectKey{ + Name: name, + Namespace: namespace, + } + + if err := cs.client.Get(ctx, key, secret); err != nil { + return nil, fmt.Errorf("failed to get secret %s/%s: %w", namespace, name, err) + } + + return parseCredentialData(secret.Data) +} + +// parseCredentialData parses credential data from secret into a map of credentials by vCenter FQDN +func parseCredentialData(data map[string][]byte) (map[string]*Credential, error) { + credentials := make(map[string]*Credential) + + // Check if this is INI format (legacy) or key-value format + if credentialsINI, ok := 
data["credentials"]; ok { + return parseINIFormat(credentialsINI) + } + + // Parse key-value format: {vcenter-fqdn}.username, {vcenter-fqdn}.password + usernameKeys := make(map[string]string) + passwordKeys := make(map[string]string) + + for key, value := range data { + if strings.HasSuffix(key, ".username") { + vcenter := strings.TrimSuffix(key, ".username") + usernameKeys[vcenter] = string(value) + } else if strings.HasSuffix(key, ".password") { + vcenter := strings.TrimSuffix(key, ".password") + passwordKeys[vcenter] = string(value) + } + } + + // Match username and password pairs + for vcenter, username := range usernameKeys { + password, ok := passwordKeys[vcenter] + if !ok { + return nil, fmt.Errorf("missing password for vCenter: %s", vcenter) + } + + credentials[vcenter] = &Credential{ + VCenterServer: vcenter, + Username: username, + Password: password, + } + } + + if len(credentials) == 0 { + return nil, fmt.Errorf("no credentials found in secret data") + } + + return credentials, nil +} + +// parseINIFormat parses INI-formatted credentials (legacy format) +func parseINIFormat(data []byte) (map[string]*Credential, error) { + credentials := make(map[string]*Credential) + lines := strings.Split(string(data), "\n") + + var currentVCenter string + var currentCred *Credential + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, ";") { + continue + } + + // Section header [vcenter.example.com] + if strings.HasPrefix(line, "[") && strings.HasSuffix(line, "]") { + // Save previous credential if exists + if currentCred != nil && currentVCenter != "" { + credentials[currentVCenter] = currentCred + } + + currentVCenter = strings.Trim(line, "[]") + currentCred = &Credential{ + VCenterServer: currentVCenter, + } + continue + } + + // Key-value pair + parts := strings.SplitN(line, "=", 2) + if len(parts) != 2 { + continue + } + + key := strings.TrimSpace(parts[0]) + value := 
strings.TrimSpace(parts[1]) + + if currentCred != nil { + switch key { + case "username": + currentCred.Username = value + case "password": + currentCred.Password = value + } + } + } + + // Save last credential + if currentCred != nil && currentVCenter != "" { + credentials[currentVCenter] = currentCred + } + + if len(credentials) == 0 { + return nil, fmt.Errorf("no credentials found in INI data") + } + + return credentials, nil +} diff --git a/pkg/controllers/problemdetector/credentials_test.go b/pkg/controllers/problemdetector/credentials_test.go new file mode 100644 index 000000000..65ad29c4e --- /dev/null +++ b/pkg/controllers/problemdetector/credentials_test.go @@ -0,0 +1,83 @@ +package problemdetector + +import ( + "context" + "fmt" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// TestGetCredentials_ComponentCredentials verifies that vSphere Problem Detector +// reads vsphere-diagnostics-creds from openshift-config namespace +func TestGetCredentials_ComponentCredentials(t *testing.T) { + // TODO: Implement test + // Given: CCO has provisioned vsphere-diagnostics-creds to openshift-config namespace + // When: vSphere Problem Detector starts + // Then: vSphere Problem Detector reads the secret successfully + // And: vSphere Problem Detector parses credential data for all vCenters +} + +// TestGetCredentials_FallbackToShared verifies fallback behavior when component +// credentials are not available +func TestGetCredentials_FallbackToShared(t *testing.T) { + // TODO: Implement test + // Given: vsphere-diagnostics-creds secret does not exist + // When: vSphere Problem Detector attempts to read credentials + // Then: vSphere Problem Detector falls back to shared credentials + // Or: vSphere Problem Detector reports error to cluster operator status +} + +// TestGetCredentialForVCenter_Success verifies FQDN-based credential 
lookup +// for multi-vCenter deployments +func TestGetCredentialForVCenter_Success(t *testing.T) { + // TODO: Implement test + // Given: credentials exist for multiple vCenters (vcenter1.example.com, vcenter2.example.com) + // When: vSphere Problem Detector processes diagnostics + // Then: vSphere Problem Detector selects the correct credential based on vCenter FQDN + // And: vSphere Problem Detector uses the matched credential for API calls +} + +// TestGetCredentialForVCenter_NotFound verifies error handling when +// credential is not found for a specific vCenter +func TestGetCredentialForVCenter_NotFound(t *testing.T) { + // TODO: Implement test + // Given: credentials exist for vcenter1.example.com + // When: vSphere Problem Detector attempts to access vcenter2.example.com + // Then: vSphere Problem Detector returns appropriate error + // And: Error message includes the vCenter FQDN that was not found +} + +// TestCredentialRotation verifies graceful credential rotation without downtime +func TestCredentialRotation(t *testing.T) { + // TODO: Implement test + // Given: vSphere Problem Detector is running with existing credentials + // When: vsphere-diagnostics-creds secret is updated with new credentials + // Then: vSphere Problem Detector detects the secret change + // And: vSphere Problem Detector gracefully restarts + // And: vSphere Problem Detector adopts new credentials + // And: Diagnostics continue without downtime + // And: No diagnostic checks are lost during rotation +} + +// TestCredentialRotation_Invalid verifies handling of invalid credentials during rotation +func TestCredentialRotation_Invalid(t *testing.T) { + // TODO: Implement test + // Given: vSphere Problem Detector is running with valid credentials + // When: vsphere-diagnostics-creds secret is updated with invalid credentials + // Then: vSphere Problem Detector detects validation failure + // And: vSphere Problem Detector reports error to cluster operator status + // And: vSphere Problem 
// DiagnosticsPrivileges defines the vSphere privileges required by the
// vSphere Problem Detector diagnostics account.
// Total: 19 privileges (3 system + 11 vCenter + 1 datacenter + 4 datastore).
//
// NOTE(review): the list is described as read-only, but
// InventoryService.Tagging.Attach, VirtualMachine.Config.DiskExtend and
// Datastore.FileManagement are not read-only by name — confirm they are
// really needed. Also confirm every ID against the vSphere privilege
// reference before this list is used for real privilege checks.
var DiagnosticsPrivileges = []string{
	// System-level read privileges (3)
	"System.Anonymous",
	"System.Read",
	"System.View",

	// vCenter-level privileges for diagnostics operations (11)
	// Tagging operations
	"Cns.Searchable",                  // CNS (Container Native Storage) search
	"InventoryService.Tagging.Read",   // Read tags
	"InventoryService.Tagging.Attach", // Validate tag attachments (read-only check)

	// Session and performance monitoring
	"Sessions.View",        // View active sessions
	"Performance.View",     // View performance metrics
	"Global.Settings.Read", // Read global settings

	// Health monitoring
	"Global.Health",                    // Check vSphere health status
	"Global.Diagnostics",               // Access diagnostic information
	"Global.ServiceManagers",           // View service managers
	"VirtualMachine.Config.DiskExtend", // Validate VM disk configuration (read-only check)

	// CNS volume operations (read-only)
	"StorageProfile.View", // View storage profiles for CNS

	// Datacenter-level read privilege (1)
	"Datacenter.Read", // Read datacenter configuration (System.Read at datacenter level)

	// Datastore-level privileges (4)
	"Datastore.Browse",         // Browse datastore contents
	"Datastore.FileManagement", // Validate file management capabilities (read-only check)
	"Datastore.Config.Read",    // Read datastore configuration
	"Datastore.Query",          // Query datastore metrics and status
}

// ValidationResult holds the result of privilege validation for one vCenter.
type ValidationResult struct {
	// Valid reports whether validation succeeded.
	Valid bool
	// VCenterServer is the vCenter the result applies to.
	VCenterServer string
	// MissingPrivileges lists the privileges found to be missing.
	MissingPrivileges []string
	// Error describes why validation failed, if a specific cause is known.
	Error error
}

// PrivilegeValidator validates vSphere privileges for diagnostics operations.
type PrivilegeValidator struct {
	// credStore supplies the credentials to validate.
	credStore *CredentialStore
}

// NewPrivilegeValidator creates a new PrivilegeValidator that reads
// credentials from credStore.
func NewPrivilegeValidator(credStore *CredentialStore) *PrivilegeValidator {
	return &PrivilegeValidator{
		credStore: credStore,
	}
}
additional vSphere API dependencies and error handling +// - This allows the credential integration to proceed while privilege checking is refined +func (pv *PrivilegeValidator) ValidatePrivileges(ctx context.Context, cred *Credential) *ValidationResult { + result := &ValidationResult{ + VCenterServer: cred.VCenterServer, + MissingPrivileges: []string{}, + } + + // Basic credential structure validation + if cred.Username == "" || cred.Password == "" { + result.Valid = false + result.Error = fmt.Errorf("invalid credentials: username or password is empty") + return result + } + + // TODO: Implement actual vSphere API privilege checking + // This would involve: + // 1. Creating a vSphere session using the credentials + // 2. Querying the AuthorizationManager for the user's privileges + // 3. Comparing against DiagnosticsPrivileges to identify missing privileges + // 4. Returning detailed results with specific privilege information + + result.Valid = true + return result +} + +// ValidateAllPrivileges validates privileges for all configured vCenters +func (pv *PrivilegeValidator) ValidateAllPrivileges(ctx context.Context) (map[string]*ValidationResult, error) { + creds, err := pv.credStore.GetCredentials(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get credentials: %w", err) + } + + results := make(map[string]*ValidationResult) + for vcenter, cred := range creds { + results[vcenter] = pv.ValidatePrivileges(ctx, cred) + } + + return results, nil +} + +// GetErrorMessage formats a user-friendly error message for privilege validation failures +func (vr *ValidationResult) GetErrorMessage() string { + if vr.Valid { + return "" + } + + if vr.Error != nil { + return fmt.Sprintf("vCenter %s: %s", vr.VCenterServer, vr.Error.Error()) + } + + if len(vr.MissingPrivileges) == 0 { + return fmt.Sprintf("vCenter %s: privilege validation failed (unknown reason)", vr.VCenterServer) + } + + return fmt.Sprintf("vCenter %s: missing privileges: %v\n"+ + "Please grant the 
following privileges to the diagnostics account:\n%s", + vr.VCenterServer, + vr.MissingPrivileges, + formatPrivilegeList(vr.MissingPrivileges)) +} + +// formatPrivilegeList formats a list of privileges for display +func formatPrivilegeList(privileges []string) string { + var result string + for _, priv := range privileges { + result += fmt.Sprintf(" - %s\n", priv) + } + return result +}