{self._escape_html(title)}
-{t('generated_by', self.lang)} v{self._get_version()}
- {body} -diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 87e0f60..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,110 +0,0 @@ -# ────────────────────────────────────────────────────────────── -# f2a CI — Lint, build, test on every push / PR -# Based on CocoRoF/googer proven workflow pattern. -# -# Optimised for speed: -# - Rust target + registry cached (Swatinem/rust-cache) -# - pip cached -# - lint & test run in parallel -# - matrix trimmed: full OS × Python only on main; PR = Linux-only -# ────────────────────────────────────────────────────────────── -name: CI - -on: - push: - branches: [main] - pull_request: - branches: [main] - -env: - CARGO_INCREMENTAL: "1" - CARGO_NET_RETRY: "10" - RUSTUP_MAX_RETRIES: "10" - -jobs: - # ── Version consistency check (fast, no build) ──────────── - check-versions: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Verify pyproject.toml == Cargo.toml versions - run: | - PY_VER=$(grep -oP '^version\s*=\s*"\K[^"]+' pyproject.toml) - RS_VER=$(grep -oP '^version\s*=\s*"\K[^"]+' Cargo.toml) - echo "pyproject.toml = $PY_VER" - echo "Cargo.toml = $RS_VER" - if [ "$PY_VER" != "$RS_VER" ]; then - echo "::error::Version mismatch! pyproject.toml=$PY_VER vs Cargo.toml=$RS_VER — update both files." 
- exit 1 - fi - - # ── Lint (Rust + Python) ────────────────────────────────── - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - with: - components: clippy, rustfmt - - - name: Rust cache - uses: Swatinem/rust-cache@v2 - with: - cache-on-failure: true - - - name: cargo fmt --check - run: cargo fmt --all -- --check - - - name: cargo clippy - run: cargo clippy --all-targets -- -D warnings - - # ── Test (cross-platform × multi-Python) ────────────────── - test: - # Run in parallel with lint (no 'needs') - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.10", "3.12"] - include: - # Spot-check other platforms with one Python version - - os: macos-14 - python-version: "3.12" - - os: windows-latest - python-version: "3.12" - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - - - name: Rust cache - uses: Swatinem/rust-cache@v2 - with: - key: ${{ matrix.os }}-py${{ matrix.python-version }} - cache-on-failure: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Pip cache - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: pip-${{ matrix.os }}-py${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }} - restore-keys: | - pip-${{ matrix.os }}-py${{ matrix.python-version }}- - - - name: Install package and test deps - run: | - python -m pip install --upgrade pip - python -m pip install .[dev] - - - name: Run tests - run: python -m pytest -v --tb=short diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 0000000..f20156e --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,43 @@ +# ────────────────────────────────────────────────────────────── +# GitHub Pages — Serve sample HTML reports +# 
Deploys the sample/ folder so reports can be viewed in-browser +# ────────────────────────────────────────────────────────────── +name: Deploy Sample Reports to Pages + +on: + push: + branches: [main] + paths: [sample/**] + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: true + +jobs: + deploy: + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - uses: actions/checkout@v5 + + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: sample/ + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 + with: + enablement: true diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 022789e..c429466 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,73 +1,83 @@ # ────────────────────────────────────────────────────────────── -# f2a — Cross-platform wheel build & PyPI deploy -# -# Trigger : push to the `deploy` branch (only when pyproject.toml changes) -# Pipeline : -# 1. check-version — skip if version already on PyPI -# 2. test — pytest across Python 3.10–3.13 -# 3. build-wheels — maturin native wheels (5 targets × 4 Pythons) -# 4. build-sdist — source distribution -# 5. publish — upload to PyPI via Trusted Publisher (OIDC) -# -# Based on CocoRoF/googer proven workflow pattern. +# PyPI auto-deploy workflow +# Trigger: push to the deploy branch +# Condition: only deploy if not yet published on PyPI, or version bumped # ────────────────────────────────────────────────────────────── -name: Build & Publish to PyPI +name: Publish to PyPI on: push: branches: - deploy - paths: - - "pyproject.toml" jobs: - # ── 1. 
Version gate ─────────────────────────────────────── + # ── Step 1: Version check ───────────────────────────────── check-version: runs-on: ubuntu-latest outputs: - version_changed: ${{ steps.check.outputs.changed }} - new_version: ${{ steps.check.outputs.new_version }} + should_publish: ${{ steps.decide.outputs.should_publish }} + local_version: ${{ steps.local.outputs.version }} steps: - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 with: - fetch-depth: 2 + python-version: "3.12" - - name: Check version change - id: check + - name: Read local version + id: local run: | - NEW_VERSION=$(grep -oP '^version\s*=\s*"\K[^"]+' pyproject.toml) - echo "new_version=$NEW_VERSION" >> "$GITHUB_OUTPUT" - - # Verify Cargo.toml version matches - CARGO_VERSION=$(grep -oP '^version\s*=\s*"\K[^"]+' Cargo.toml) - if [ "$NEW_VERSION" != "$CARGO_VERSION" ]; then - echo "::error::Version mismatch! pyproject.toml=$NEW_VERSION vs Cargo.toml=$CARGO_VERSION" - exit 1 - fi + VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])") + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "📦 Local version: $VERSION" - # Compare with previous commit - OLD_VERSION=$(git show HEAD~1:pyproject.toml 2>/dev/null | grep -oP '^version\s*=\s*"\K[^"]+' || echo "") - - echo "Old version: $OLD_VERSION" - echo "New version: $NEW_VERSION" + - name: Check PyPI for existing version + id: pypi + run: | + LOCAL="${{ steps.local.outputs.version }}" + # PyPI JSON API — returns 404 if the package does not exist + HTTP_CODE=$(curl -s -o /tmp/pypi.json -w "%{http_code}" \ + "https://pypi.org/pypi/f2a/json") + + if [ "$HTTP_CODE" = "404" ]; then + echo "pypi_version=NONE" >> "$GITHUB_OUTPUT" + echo "🆕 Package not yet on PyPI" + else + PYPI_VER=$(python -c " + import json, pathlib + data = json.loads(pathlib.Path('/tmp/pypi.json').read_text()) + print(data['info']['version']) + ") + echo "pypi_version=$PYPI_VER" >> "$GITHUB_OUTPUT" 
+ echo "📡 PyPI version: $PYPI_VER" + fi - if [ -z "$OLD_VERSION" ] || [ "$OLD_VERSION" != "$NEW_VERSION" ]; then - echo "changed=true" >> "$GITHUB_OUTPUT" - echo "✅ Version changed: $OLD_VERSION -> $NEW_VERSION" + - name: Decide whether to publish + id: decide + run: | + LOCAL="${{ steps.local.outputs.version }}" + PYPI="${{ steps.pypi.outputs.pypi_version }}" + + if [ "$PYPI" = "NONE" ]; then + echo "should_publish=true" >> "$GITHUB_OUTPUT" + echo "✅ First publish — will deploy $LOCAL" + elif [ "$LOCAL" != "$PYPI" ]; then + echo "should_publish=true" >> "$GITHUB_OUTPUT" + echo "✅ Version bumped ($PYPI → $LOCAL) — will deploy" else - echo "changed=false" >> "$GITHUB_OUTPUT" - echo "⏭️ Version unchanged, skipping publish." + echo "should_publish=false" >> "$GITHUB_OUTPUT" + echo "⏭️ Version $LOCAL already on PyPI — skipping" fi - # ── 2. Test ─────────────────────────────────────────────── + # ── Step 2: Tests ───────────────────────────────────────── test: needs: check-version - if: needs.check-version.outputs.version_changed == 'true' + if: needs.check-version.outputs.should_publish == 'true' runs-on: ubuntu-latest strategy: - fail-fast: false matrix: - python-version: ["3.10", "3.13"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 @@ -76,123 +86,41 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - - - name: Rust cache - uses: Swatinem/rust-cache@v2 - with: - key: publish-py${{ matrix.python-version }} - cache-on-failure: true - - - name: Pip cache - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: pip-publish-py${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }} - restore-keys: | - pip-publish-py${{ matrix.python-version }}- - - - name: Install package and test deps + - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install .[dev] + pip install -e ".[dev]" + pip install beautifulsoup4 
html5lib - name: Run tests - run: python -m pytest -v --tb=short - - # ── 3. Build wheels (maturin) ───────────────────────────── - build-wheels: - needs: [check-version, test] - if: needs.check-version.outputs.version_changed == 'true' - strategy: - fail-fast: false - matrix: - include: - # ── Linux x86_64 ── - - os: ubuntu-latest - target: x86_64-unknown-linux-gnu - manylinux: auto - # ── Linux aarch64 ── - - os: ubuntu-latest - target: aarch64-unknown-linux-gnu - manylinux: auto - # ── macOS x86_64 (Intel, cross-compiled on ARM) ── - - os: macos-14 - target: x86_64-apple-darwin - manylinux: "off" - # ── macOS aarch64 (Apple Silicon) ── - - os: macos-14 - target: aarch64-apple-darwin - manylinux: "off" - # ── Windows x86_64 ── - - os: windows-latest - target: x86_64-pc-windows-msvc - manylinux: "off" - - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --interpreter 3.10 3.11 3.12 3.13 - manylinux: ${{ matrix.manylinux }} - before-script-linux: | - # Ensure Perl is available for vendored OpenSSL build (openssl-src) - if command -v yum &> /dev/null; then - yum install -y perl-IPC-Cmd perl-core - elif command -v apk &> /dev/null; then - apk add --no-cache perl make - fi - - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-${{ matrix.target }} - path: dist/*.whl + run: pytest git_action/tests/ -v --tb=short - # ── 4. 
Build sdist ──────────────────────────────────────── - build-sdist: + # ── Step 3: Build & Deploy ──────────────────────────────── + publish: needs: [check-version, test] - if: needs.check-version.outputs.version_changed == 'true' + if: needs.check-version.outputs.should_publish == 'true' runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write # Trusted Publisher (OIDC) steps: - uses: actions/checkout@v4 - - name: Build sdist - uses: PyO3/maturin-action@v1 - with: - command: sdist - args: --out dist - - - name: Upload sdist - uses: actions/upload-artifact@v4 + - name: Set up Python + uses: actions/setup-python@v5 with: - name: sdist - path: dist/*.tar.gz + python-version: "3.12" - # ── 5. Publish to PyPI ──────────────────────────────────── - publish: - needs: [check-version, build-wheels, build-sdist] - if: needs.check-version.outputs.version_changed == 'true' - runs-on: ubuntu-latest - environment: pypi - permissions: - id-token: write # Required for Trusted Publisher (OIDC) - steps: - - name: Download all artifacts - uses: actions/download-artifact@v4 - with: - path: dist - merge-multiple: true + - name: Install build tools + run: python -m pip install --upgrade pip build - - name: List artifacts - run: ls -lh dist/ + - name: Build package + run: python -m build - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - # Uses Trusted Publisher (OIDC) — no API token needed. - # Register at: https://pypi.org/manage/project/f2a/settings/publishing/ + # Uses Trusted Publisher, so no API token required. + # You must register GitHub Actions as a Trusted Publisher in the PyPI project settings. 
+ # To use a manual token instead, uncomment below: + # with: + # password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index c9c39d9..a75caaf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,6 @@ # f2a — .gitignore -# ── Rust ── -target/ -**/*.rs.bk - -# ── Python ── +# Python __pycache__/ *.py[cod] *$py.class @@ -12,35 +8,41 @@ __pycache__/ dist/ build/ *.egg -*.so -*.pyd -*.pdb -# ── Virtual environments ── +# Virtual environments .venv/ venv/ env/ -# ── IDE ── +# IDE .vscode/ .idea/ *.swp *.swo -# ── OS ── +# OS .DS_Store Thumbs.db -# ── Test & Coverage ── +# Test & Coverage .pytest_cache/ htmlcov/ .coverage coverage.xml -test_data_e2e/ -# ── Output ── +# Manual test folder +test/ + +# Output +examples/output/ +examples/sample_data.csv output/*.html -# ── mypy / ruff ── +# Allow sample reports +!sample/ + +# mypy .mypy_cache/ + +# ruff .ruff_cache/ diff --git a/ADVANCED_ANALYSIS_PLAN.md b/ADVANCED_ANALYSIS_PLAN.md new file mode 100644 index 0000000..374f785 --- /dev/null +++ b/ADVANCED_ANALYSIS_PLAN.md @@ -0,0 +1,371 @@ +# f2a Advanced Analysis Plan + +> **목적**: ML 논문/기법 기반의 고급 분석 기능을 체계적으로 설계하고, HTML 리포트에 2-depth 탭 구조로 제공한다. + +--- + +## 1. 
현재 상태 분석 (As-Is) + +### 1.1 현재 구현된 분석 (Basic Report) + +| 영역 | 기법 | 비고 | +|------|------|------| +| **Descriptive** | count, missing, unique, mean, median, std, SE, CV, MAD, min/max/range, p5/q1/q3/p95, IQR, skewness, kurtosis | 16개 수치 지표 | +| **Distribution** | Shapiro-Wilk, D'Agostino, KS, Anderson-Darling, skew/kurt 분류 | 4개 정규성 검정 | +| **Correlation** | Pearson, Spearman, Kendall, Cramér's V, VIF | 5종 상관 분석 | +| **Missing** | column summary, row distribution, total ratio | 기초 결측 분석 | +| **Outlier** | IQR method, Z-score method | 2종 이상치 탐지 | +| **Categorical** | frequency, entropy, chi-square independence | 범주 분석 | +| **Feature Importance** | variance ranking, mean abs correlation, mutual information | 3종 중요도 | +| **PCA** | StandardScaler + PCA, scree, loadings | 기초 차원축소 | +| **Duplicates** | exact duplicates, column uniqueness | 중복 탐지 | +| **Quality** | completeness, uniqueness, consistency, validity (weighted) | 4차원 품질 | + +### 1.2 현재 HTML 구조 + +``` +Header +├── [1-depth tabs: subset/split 선택] ← 현재 유일한 탭 계층 +│ ├── Overview +│ ├── Data Quality +│ ├── Preprocessing +│ ├── Descriptive Statistics +│ ├── Distribution Analysis +│ ├── Correlation Analysis +│ ├── Missing Data +│ ├── Outlier Detection +│ ├── Categorical Analysis +│ ├── Feature Importance +│ ├── PCA +│ ├── Duplicates +│ └── Warnings +Footer +``` + +--- + +## 2. Advanced 분석 기법 설계 (To-Be) + +### 2.1 새로운 HTML 2-Depth 탭 구조 + +``` +Header +├── [1-depth: subset/split 선택] ← 기존 +│ ├── [2-depth: Basic | Advanced] ← 신규 +│ │ ├── Basic → 기존 모든 섹션 그대로 +│ │ └── Advanced +│ │ ├── A1. Advanced Distribution +│ │ ├── A2. Advanced Correlation +│ │ ├── A3. Clustering Analysis +│ │ ├── A4. Dimensionality Reduction +│ │ ├── A5. Feature Engineering Insights +│ │ ├── A6. Anomaly Detection +│ │ ├── A7. Statistical Tests +│ │ └── A8. Data Profiling Summary +Footer +``` + +--- + +### 2.2 Advanced 탭 상세 설계 + +--- + +#### A1. 
Advanced Distribution Analysis + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **Kernel Density Estimation (KDE) bandwidth selection** | Silverman(1986), Scott's rule | `scipy.stats.gaussian_kde`로 최적 bandwidth 자동 추정, KDE curve + histogram overlay | 데이터의 실제 분포 형태를 비모수적으로 파악 | +| **Best-fit distribution matching** | D'Agostino & Stephens(1986) | `scipy.stats`의 주요 분포(norm, lognorm, exponential, gamma, beta, weibull, uniform) 피팅 후 AIC/BIC 비교 | 각 컬럼이 어떤 이론적 분포에 가장 가까운지 자동 식별 | +| **Jarque-Bera test** | Jarque & Bera(1987) | `scipy.stats.jarque_bera` — skewness+kurtosis 기반 정규성 검정 | 기존 4개 검정에 추가, 대표본에 특히 유효 | +| **Power transformation recommendation** | Box-Cox(1964), Yeo-Johnson(2000) | `scipy.stats.boxcox`/`yeojohnson`으로 변환 후 skewness 변화량 측정 | 어떤 변환이 정규성을 개선하는지 자동 추천 | +| **Empirical CDF** | Kolmogorov(1933) | `statsmodels.distributions.empirical_distribution.ECDF` 또는 직접 step plot | 데이터의 누적 분포를 직관적으로 시각화 | + +**시각화:** +- KDE overlay histograms (bandwidth comparison) +- Best-fit distribution overlay plot (데이터 + 최적 분포 곡선) +- Power transformation before/after comparison +- ECDF step plots + +--- + +#### A2. Advanced Correlation Analysis + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **Partial correlation** | Fisher(1924) | 다른 변수를 제어한 상태에서의 순수 상관. inverse correlation matrix에서 추출 | 교란 변수 제거한 진정한 관계 파악 | +| **Distance correlation** | Székely et al.(2007) | `dcor` 라이브러리 또는 직접 구현. 
비선형 관계까지 감지 | Pearson이 놓치는 비선형 의존성 탐지 | +| **Mutual Information heatmap** | Shannon(1948), Kraskov et al.(2004) | sklearn `mutual_info_regression`으로 전체 컬럼 쌍 MI 행렬 생성 | 비선형 정보 공유량의 정량적 시각화 | +| **Correlation stability (bootstrap)** | Efron(1979) | 상관계수의 bootstrap 신뢰구간 (95% CI) 계산 | 상관 추정의 신뢰도/안정성 평가 | +| **Correlation network graph** | Graph theory | 상관 threshold 초과 쌍을 node-edge로 시각화 | 변수 간 관계 구조의 직관적 네트워크 파악 | + +**시각화:** +- Partial correlation heatmap +- MI heatmap +- Bootstrap correlation CI forest plot +- Correlation network graph (matplotlib `networkx` layout) + +--- + +#### A3. Clustering Analysis + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **K-Means + Elbow/Silhouette** | MacQueen(1967), Rousseeuw(1987) | sklearn `KMeans` (k=2~10) + inertia elbow + silhouette score → optimal k 자동 결정 | 데이터의 자연 군집 구조 탐색 | +| **DBSCAN** | Ester et al.(1996) | sklearn `DBSCAN` with automated eps (k-distance graph) | 밀도 기반 클러스터링, 노이즈/이상치 자연 분리 | +| **Hierarchical clustering (dendrogram)** | Ward(1963) | `scipy.cluster.hierarchy.linkage` + dendrogram | 계층적 구조 시각화, 적절한 컷 레벨 참고 | +| **Cluster profiling** | — | 군집별 평균/분포 요약 테이블 생성 | 각 군집의 특성 자동 프로파일링 | + +**시각화:** +- Elbow plot + Silhouette score plot +- 2D PCA scatter with cluster labels (color-coded) +- DBSCAN result scatter (noise = gray) +- Dendrogram +- Cluster profile radar/bar chart + +--- + +#### A4. 
Dimensionality Reduction (확장) + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **t-SNE** | van der Maaten & Hinton(2008) | sklearn `TSNE(n_components=2, perplexity=30)` | 고차원 데이터의 2D 비선형 임베딩으로 군집 시각화 | +| **UMAP** | McInnes et al.(2018) | `umap-learn` 라이브러리 (optional dependency) | t-SNE보다 빠르고 전역 구조 보존 | +| **Factor Analysis** | Spearman(1904), Thurstone(1935) | sklearn `FactorAnalysis` — 잠재 요인 추출 + loadings | PCA와 달리 잠재 변수 모델, 해석력 우수 | +| **Explained variance per feature** | Kaiser criterion(1960) | 각 원본 feature가 top-k PC에 기여하는 분산 비율 | feature-level 중요도의 차원축소 관점 제공 | + +**시각화:** +- t-SNE 2D scatter (cluster labels overlay 가능) +- UMAP 2D scatter +- Factor loadings heatmap +- Feature contribution stacked bar chart + +--- + +#### A5. Feature Engineering Insights + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **Interaction detection** | Friedman & Popescu(2008) | 수치 컬럼 쌍의 곱/비율 생성 후 분산/상관 분석 | 유망한 interaction feature 자동 발견 | +| **Monotonic relationship detection** | Spearman rho | Spearman vs. Pearson 차이로 비선형 단조성 판별 | 변환이 필요한 비선형 관계 식별 | +| **Binning analysis** | Dougherty et al.(1995) | 수치 컬럼의 equal-width/equal-freq 빈 생성, 빈별 엔트로피 비교 | 이산화 전략 선택 도움 | +| **Cardinality analysis** | — | 범주형 컬럼의 유니크 비율별 인코딩 전략 추천 (one-hot / target / ordinal) | 전처리 파이프라인 설계 자동 가이드 | +| **Target leakage detection** | Kaufman et al.(2012) | 수치 컬럼 중 다른 컬럼과 r>0.99 또는 MI≈max인 쌍 경고 | 데이터 누수 조기 발견 | + +**시각화:** +- Top-N interaction feature 분포 히스토그램 +- Spearman vs Pearson 차이 bar chart +- Encoding strategy recommendation 테이블 + +--- + +#### A6. 
Anomaly Detection (확장) + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **Isolation Forest** | Liu et al.(2008) | sklearn `IsolationForest` → anomaly score per row | 다변량 이상치 탐지 (IQR/Z-score는 단변량) | +| **Local Outlier Factor (LOF)** | Breunig et al.(2000) | sklearn `LocalOutlierFactor` → LOF score per row | 밀도 기반 국소 이상치, 군집 밖의 점 탐지 | +| **Mahalanobis distance** | Mahalanobis(1936) | 공분산 기반 다변량 거리, chi-squared 임계값 | 상관 구조를 고려한 다변량 이상치 | +| **Anomaly summary** | — | 다수 방법의 consensus (≥2 방법에서 anomaly → 고확률) | 단일 방법 의존 제거, 견고한 이상치 판정 | + +**시각화:** +- Isolation Forest anomaly score 분포 히스토그램 +- LOF score scatter (2D PCA 공간에서) +- Mahalanobis distance 히스토그램 with chi-squared 임계선 +- Consensus anomaly heatmap (row × method) + +--- + +#### A7. Statistical Tests + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **Levene's test (등분산)** | Levene(1960) | 범주별로 수치 컬럼의 분산 동질성 검정 | ANOVA 전제조건 확인 | +| **Kruskal-Wallis test** | Kruskal & Wallis(1952) | 비모수 다집단 중위수 비교 | 비정규 분포에서의 집단 차이 검정 | +| **Mann-Whitney U test** | Mann & Whitney(1947) | 이진 범주와 수치 컬럼 간 비모수 검정 | 두 집단 차이의 비모수 평가 | +| **Chi-square goodness of fit** | Pearson(1900) | 범주형 컬럼의 균등 분포 검정 | 범주 분포의 편향 정도 정량 평가 | +| **Grubbs' test** | Grubbs(1950) | 단일 이상치의 통계적 유의성 검정 | 극단값의 통계적 유의미성 판별 | +| **Stationarity (ADF test)** | Dickey & Fuller(1979) | 시계열 컬럼의 단위근 검정 (`statsmodels.tsa`) | 시계열 정상성 자동 판단 | + +**시각화:** +- Group comparison boxplots (Kruskal-Wallis/Mann-Whitney와 함께) +- Test results summary table with p-values and significance stars +- Levene test bar chart per column + +--- + +#### A8. 
Data Profiling Summary + +| 기법 | 근거 | 구현 계획 | 효과 | +|------|------|-----------|------| +| **Automated insight generation** | AutoEDA literature | 모든 분석 결과를 종합하여 자연어 인사이트 생성 | 비전문가도 핵심 발견 사항을 즉시 파악 | +| **Feature type recommendation** | — | 각 컬럼의 분포/유니크/결측 패턴으로 최적 ML 타입 추천 | ML 파이프라인 설계 가이드 | +| **Dataset complexity scoring** | Ho & Basu(2002) | 차원수, 클래스 수, 불균형도, 상관 구조 → 복잡도 점수 | 데이터셋 난이도의 정량적 평가 | +| **Overall health dashboard** | — | 전체 분석 결과의 1-page 대시보드 (트래픽 라이트 시스템) | 데이터 상태의 즉각적 파악 | + +**시각화:** +- Health score radar chart (6 축: completeness, consistency, outlier ratio, skewness, correlation, duplicates) +- Insight cards (자동 생성된 주요 발견 사항) +- Feature type recommendation table + +--- + +## 3. 기술 의존성 분석 + +### 3.1 새로 필요한 패키지 + +| 패키지 | 용도 | 필수 여부 | 비고 | +|--------|------|-----------|------| +| `scikit-learn` | K-Means, DBSCAN, IsolationForest, LOF, FactorAnalysis, t-SNE, MI | **이미 설치됨** | core dependency | +| `networkx` | Correlation network graph | Optional | `try/except` 처리 | +| `umap-learn` | UMAP 차원축소 | Optional | `try/except` 처리 | +| `statsmodels` | ADF test, ECDF | Optional | `try/except` 처리 | + +**원칙:** `scikit-learn`과 기존 종속성(`scipy`, `numpy`, `pandas`, `matplotlib`, `seaborn`)만으로 A1~A8의 80%+ 구현 가능. `networkx`, `umap-learn`, `statsmodels`는 optional — 없으면 해당 분석을 건너뛰고 "library not available" 메시지 표시. + +### 3.2 성능 고려사항 + +| 기법 | 시간 복잡도 | 대응 전략 | +|------|-------------|-----------| +| t-SNE | O(n²) | n>5000이면 샘플링 후 수행 | +| UMAP | O(n·log(n)) | n>10000이면 샘플링 | +| Isolation Forest | O(n·t·log(n)) | max_samples=min(256, n) 기본 | +| MI 행렬 | O(n·d²) | d>30이면 top-30 컬럼만 | +| Bootstrap CI | O(B·n) | B=1000, n>5000이면 샘플링 | +| K-Means elbow | O(k·n·d·iter) | k=2~10, max_iter=100 | +| Best-fit distribution | O(n·d_count) | 7개 분포만 피팅 | + +--- + +## 4. 구현 계획 + +### Phase 1: 인프라 (2-depth 탭 구조) + +1. **`AnalysisConfig` 확장** — `advanced: bool = True` 플래그 + `AdvancedConfig` sub-dataclass 추가 +2. **HTML generator 2-depth 탭** — 기존 subset 탭 내부에 "Basic / Advanced" 서브탭 도입 +3. 
**`StatsResult` 확장** — `advanced_stats: dict[str, Any]` 필드 추가 +4. **`VizResult` 확장** — advanced plot 메서드 추가 + +### Phase 2: Advanced Stats 모듈 (4개 파일) + +5. **`stats/advanced_distribution.py`** — best_fit, kde_bandwidth, jarque_bera, power_transform, ecdf +6. **`stats/advanced_correlation.py`** — partial_corr, mi_matrix, bootstrap_ci, correlation_network_data +7. **`stats/clustering.py`** — kmeans_analysis, dbscan_analysis, hierarchical, cluster_profiles +8. **`stats/statistical_tests.py`** — levene, kruskal_wallis, mann_whitney, chi_sq_goodness, grubbs, adf_stationarity + +### Phase 3: Advanced Stats 모듈 (3개 파일) + +9. **`stats/advanced_anomaly.py`** — isolation_forest, lof, mahalanobis, consensus_anomaly +10. **`stats/advanced_dimreduction.py`** — tsne, umap, factor_analysis, feature_contribution +11. **`stats/feature_insights.py`** — interaction_detection, monotonic_detection, binning_analysis, cardinality_analysis, leakage_detection + +### Phase 4: Advanced Viz 모듈 (4개 파일) + +12. **`viz/advanced_dist_plots.py`** — best_fit_overlay, power_transform_comparison, ecdf_plot, kde_bandwidth_comparison +13. **`viz/advanced_corr_plots.py`** — partial_corr_heatmap, mi_heatmap, bootstrap_ci_plot, network_graph +14. **`viz/cluster_plots.py`** — elbow_plot, silhouette_plot, cluster_scatter, dendrogram, cluster_profiles_chart +15. **`viz/advanced_anomaly_plots.py`** — isolation_forest_hist, lof_scatter, mahalanobis_hist, consensus_heatmap + +### Phase 5: 통합 + +16. **Analyzer `_compute_advanced_stats()` 추가** — 각 advanced 모듈 호출 +17. **VizResult advanced plot 메서드 추가** — 각 advanced viz 호출 +18. **HTML generator advanced 섹션 빌더** — 8개 advanced 섹션 + 서브탭 +19. **Data Profiling Summary (A8)** — insights 자동 생성 로직 + +### Phase 6: 마무리 + +20. **pyproject.toml 업데이트** — optional deps 추가 +21. **`_METRIC_TIPS` 확장** — advanced 지표 tooltip 추가 +22. **End-to-end 테스트** — 실제 데이터셋으로 전체 리포트 생성 검증 + +--- + +## 5. 
효과성 평가 + +### 5.1 분석 범위 확장 + +| 카테고리 | Basic (현재) | + Advanced | 커버리지 증가 | +|----------|-----------|------------|---------------| +| 정규성/분포 검정 | 4종 | +3종 (JB, 7-dist fitting, power transform) | +75% | +| 상관 분석 | 5종 | +4종 (partial, MI matrix, bootstrap CI, network) | +80% | +| 이상치 탐지 | 2종 (단변량) | +3종 (다변량: IF, LOF, Mahalanobis) | +150% | +| 차원축소 | PCA 1종 | +3종 (t-SNE, UMAP, Factor Analysis) | +300% | +| 군집 분석 | 0종 | +3종 (K-Means, DBSCAN, Hierarchical) | 신규 | +| 통계 검정 | 4종 (정규성) | +6종 (등분산, 비모수, 적합도, Grubbs, ADF) | +150% | +| Feature 공학 | 3종 (중요도) | +5종 (interaction, monotonic, binning, cardinality, leakage) | +167% | +| Data profiling | 품질 점수 | +3종 (insights, type recommendation, complexity) | +300% | + +### 5.2 실무적 가치 + +1. **비선형 관계 탐지**: Pearson/Spearman만으로는 포착 불가능한 비선형 의존성을 MI, distance correlation 으로 발견 +2. **다변량 이상치**: IQR/Z-score는 단변량 — Isolation Forest와 LOF로 변수 간 상호작용 고려한 이상치 탐지 +3. **군집 구조 발견**: 데이터의 자연 그룹을 자동 탐색, ML 모델링 전 데이터 이해도 극대화 +4. **최적 분포 식별**: 각 변수의 이론적 분포를 자동 피팅하여 변환/모델링 전략 결정 +5. **통계적 유의성**: 시각적 차이를 넘어 통계 검정으로 엄밀한 판단 근거 제공 +6. 
**Feature 공학 자동화**: interaction feature, 인코딩 전략, 데이터 누수를 자동 탐지 + +### 5.3 학술적 근거 (Key References) + +| # | 논문/방법 | 연도 | 핵심 기여 | +|---|-----------|------|-----------| +| 1 | Silverman, *Density Estimation for Statistics and Data Analysis* | 1986 | KDE bandwidth selection | +| 2 | Jarque & Bera, *Efficient tests for normality* | 1987 | JB normality test | +| 3 | Box & Cox, *An analysis of transformations* | 1964 | Power transformation | +| 4 | Yeo & Johnson, *A new family of power transformations* | 2000 | 음수 허용 power transform | +| 5 | Székely et al., *Measuring and testing dependence by correlation of distances* | 2007 | Distance correlation | +| 6 | Shannon, *A mathematical theory of communication* | 1948 | Mutual information | +| 7 | Efron, *Bootstrap methods: another look at the jackknife* | 1979 | Bootstrap CI | +| 8 | MacQueen, *Some methods for classification* | 1967 | K-Means | +| 9 | Ester et al., *A density-based algorithm (DBSCAN)* | 1996 | DBSCAN | +| 10 | Rousseeuw, *Silhouettes: a graphical aid* | 1987 | Silhouette score | +| 11 | van der Maaten & Hinton, *Visualizing data using t-SNE* | 2008 | t-SNE | +| 12 | McInnes et al., *UMAP: Uniform manifold approximation* | 2018 | UMAP | +| 13 | Liu et al., *Isolation forest* | 2008 | Isolation Forest | +| 14 | Breunig et al., *LOF: identifying density-based local outliers* | 2000 | LOF | +| 15 | Mahalanobis, *On the generalized distance in statistics* | 1936 | Mahalanobis distance | +| 16 | Fisher, *The distribution of the partial correlation coefficient* | 1924 | Partial correlation | +| 17 | Levene, *Robust tests for equality of variances* | 1960 | Levene's test | +| 18 | Kruskal & Wallis, *Use of ranks in one-criterion variance analysis* | 1952 | KW test | +| 19 | Dickey & Fuller, *Distribution of the estimators* | 1979 | ADF stationarity test | +| 20 | Ho & Basu, *Complexity measures of supervised classification problems* | 2002 | Dataset complexity | + +--- + +## 6. 
2-Depth 탭 UI 설계 + +### 6.1 탭 구조 + +```html + +
+ + +
+
+
+
+
Generated from f2a.analyze("lerobot/roboturk") — a single line of code.
No data available
" + + sub = df.head(max_rows) + # Build table manually to inject data-tip attributes + parts: list[str] = ['| {idx_name} | ") + for col in sub.columns: + col_str = str(col) + tip = _METRIC_TIPS.get(col_str, "") + tip_attr = f' data-tip="{tip}"' if tip else "" + key_attr = f' data-tip-key="{col_str}"' if tip else "" + parts.append(f"{col} | ") + parts.append("
|---|---|
| {html_mod.escape(str(idx_val))} | ") + for col in sub.columns: + val = row[col] + col_str = str(col) + col_tip = _METRIC_TIPS.get(col_str, "") + # Format the display value + if isinstance(val, float): + display = f"{val:.4f}" + else: + display = str(val) if pd.notna(val) else "NaN" + tip_attr = f' data-tip="{col_tip}"' if col_tip else "" + key_attr = f' data-tip-key="{col_str}"' if col_tip else "" + parts.append(f"{html_mod.escape(display)} | ") + parts.append("
{html_mod.escape(exec_summary)}
' + '{desc}
' + ) + actions = ins.get("action_items", []) + if actions: + body += '{html_mod.escape(dataset_name)} — + {rows:,} rows x + {cols} columns
+ {meta_html} +{html_mod.escape(dataset_name)}
+ {meta_html} +Overview gives you a bird's-eye view of the entire dataset before you dive into " + "deeper analysis.
" + "What you will find here:
" + "Why it matters: Verifying row counts and types first catches loading errors " + "(truncated files, wrong delimiters, encoding issues) before they silently corrupt downstream results.
" + "Beginner tip: If the row count is much smaller than expected, your file may have " + "been loaded with the wrong separator. If a numeric column shows up as 'text', it probably " + "contains non-numeric characters that need cleaning.
" + ), + }, + "section_quality": { + "tip": "Scores the dataset across completeness, uniqueness, consistency, and validity (0-100%).", + "desc": ( + "Data Quality Assessment evaluates your dataset along four independent dimensions, " + "each scored from 0 to 100%. Think of it as a health check-up for your data.
" + "The four quality dimensions:
" + "How to read the scores:
" + "Overall score formula: 0.35 × Completeness + 0.25 × Uniqueness + " + "0.20 × Consistency + 0.20 × Validity. Completeness is weighted highest because missing " + "data affects nearly every analysis method.
" + ), + }, + "section_preprocessing": { + "tip": "Documents all automatic cleaning and transformation steps applied before analysis.", + "desc": ( + "Preprocessing Log records every automatic cleaning action the system performed on " + "your raw data before running any analysis.
" + "Why this matters: Reproducibility is the cornerstone of trustworthy analysis. " + "If you cannot explain exactly what transformations were applied, your results cannot be verified.
" + "Common preprocessing steps recorded:
" + "Beginner tip: Always review this log. If you see an important column was dropped, " + "it may mean the original data had formatting issues that need manual fixing.
" + ), + }, + "section_descriptive": { + "tip": "Central tendency, dispersion, and shape statistics for every column.", + "desc": ( + "Descriptive Statistics is the foundation of Exploratory Data Analysis (EDA). " + "It summarises each column with a set of numbers that describe its centre, spread, and shape.
" + "For numeric columns, you will see:
" + "For categorical columns: count, unique, top (most frequent), freq (frequency of top).
" + "Beginner tip: Look for columns where mean and median are very different -- " + "this signals outliers or skewed data that may need special treatment.
" + ), + }, + "section_distribution": { + "tip": "Histograms and Q-Q plots revealing the shape and spread of each numeric column.", + "desc": ( + "Distribution Analysis visualises how values are spread out in each numeric column. " + "While descriptive statistics give you numbers, distribution plots let you see the shape.
" + "What the charts show:
" + "Common distribution shapes:
" + "Why it matters: Many machine learning algorithms assume normally distributed inputs. " + "Knowing the actual distribution shape helps you choose the right model or apply transformations.
" + ), + }, + "section_correlation": { + "tip": "Measures pairwise linear and rank-based relationships between numeric columns.", + "desc": ( + "Correlation Analysis measures how strongly pairs of numeric columns are related.
" + "Two types of correlation computed:
" + "How to read the heatmap: Darker colours = stronger correlation. Red = positive, " + "Blue = negative. The diagonal is always 1.0 (each variable is perfectly correlated with itself).
" + "Warning thresholds:
" + "Beginner tip: High correlation between two features means they carry similar " + "information. Including both in a linear model can cause instability (multicollinearity).
" + ), + }, + "section_missing": { + "tip": "Analyses patterns, proportions, and potential mechanisms of missing data.", + "desc": ( + "Missing Data Analysis investigates where, how much, and " + "why data is missing.
" + "Key metrics:
" + "Three mechanisms of missingness:
" + "Practical guidelines:
" + "Outlier Detection identifies data points that are unusually far from the rest. " + "Outliers can be genuine extreme values or data entry errors.
" + "Detection method (IQR):
" + "Important: Not all outliers are errors! In many domains (fraud detection, " + "rare diseases, extreme weather), outliers are the most interesting data points. " + "Always investigate before removing.
" + "Beginner tip: Use box plots (shown in this section) to visually assess outliers. " + "Points shown as dots beyond the whiskers are potential outliers worth examining.
" + ), + }, + "section_categorical": { + "tip": "Frequency distributions, bar charts, and entropy analysis for categorical columns.", + "desc": ( + "Categorical Analysis examines non-numeric columns -- text labels, categories, " + "boolean flags, and any column with a limited set of distinct values.
" + "Key metrics for each categorical column:
" + "Why it matters: Categories with very low frequency (rare classes) can cause " + "problems in machine learning. A column where one category appears 99% of the time " + "carries almost no information.
" + "Beginner tip: Look at the bar charts. If one bar is overwhelmingly taller than " + "the rest, the column is 'imbalanced' -- you may need special techniques like oversampling.
" + ), + }, + "section_importance": { + "tip": "Ranks features by informational value using variance and mutual information.", + "desc": ( + "Feature Importance helps you answer: Which columns carry the most useful " + "information?
" + "Methods used:
" + "Practical use: Features with near-zero importance are candidates for removal. " + "Reducing dimensionality can speed up training, reduce overfitting, and improve interpretability.
" + "Beginner tip: Do NOT blindly remove all low-importance features. Sometimes a " + "feature is unimportant alone but becomes powerful when combined with another (interaction effects).
" + ), + }, + "section_pca": { + "tip": "Principal Component Analysis reveals the intrinsic dimensionality of the data.", + "desc": ( + "PCA (Principal Component Analysis) is a technique that transforms correlated features " + "into a smaller set of uncorrelated components ordered by how much variance they explain.
" + "Key outputs:
" + "Why it matters: If 95% of variance is explained by 3 components out of 50 features, " + "your data's intrinsic dimensionality is very low -- many features are redundant.
" + "Beginner tip: PCA works best when features are on similar scales. The system " + "automatically standardises (z-score) before applying PCA.
" + ), + }, + "section_duplicates": { + "tip": "Identifies exact duplicate rows that may inflate statistics or bias models.", + "desc": ( + "Duplicate Analysis scans for rows that are exactly identical across all columns.
" + "Why duplicates matter:
" + "Metrics shown: total rows, duplicate count, unique count, and duplicate ratio.
" + "Beginner tip: A small number of duplicates (< 1%) is often harmless, especially " + "if your data legitimately contains identical records. But always investigate unexpected high ratios.
" + ), + }, + "section_warnings": { + "tip": "Aggregated warnings and potential issues detected across all analyses.", + "desc": ( + "Warnings & Issues collects all anomalies and concerns found during the " + "entire analysis into one place for easy review.
" + "Common warnings include:
" + "Beginner tip: Treat this section as a priority to-do list. Address the highest-" + "severity warnings first, then re-run your analysis to see if the quality score improves.
" + ), + }, + # ===== Advanced: Distribution+ ===== + "sub_best_fit": { + "tip": "Finds the theoretical distribution (Normal, Gamma, Weibull, etc.) that best matches each column.", + "desc": ( + "Best-Fit Distribution evaluates each numeric column against a library of theoretical " + "distributions to find the one that most closely matches the observed data.
" + "Distributions tested include: Normal, Lognormal, Exponential, Gamma, Beta, Weibull, " + "Uniform, and more.
" + "How the best fit is selected:
" + "Why it matters: Knowing which distribution generated your data enables better " + "simulation, parametric modelling, confidence interval construction, and anomaly detection.
" + "Beginner tip: If the best-fit distribution is 'norm' (normal), many standard " + "statistical tests apply directly. If it is something else (e.g. lognorm), consider a " + "log-transform before applying methods that assume normality.
" + ), + }, + "sub_jarque_bera": { + "tip": "Tests whether each column's skewness and kurtosis match a normal distribution.", + "desc": ( + "Jarque-Bera Normality Test specifically checks if the shape of your " + "data matches a normal (bell-curve) distribution.
" + "How it works:
" + "How to interpret:
" + "Beginner tip: Non-normal data is extremely common in real-world datasets. " + "A failed normality test does not mean your data is 'bad' -- it means you should use " + "non-parametric methods or apply a transformation (like log or Box-Cox).
" + ), + }, + "sub_power_transform": { + "tip": "Recommends Box-Cox or Yeo-Johnson transformations to make skewed distributions more Gaussian.", + "desc": ( + "Power Transform Recommendation suggests mathematical transformations that can " + "reshape your skewed data into a more bell-shaped (Gaussian) form.
" + "Two methods evaluated:
" + "Key outputs:
" + "Beginner tip: Power transforms are essential preprocessing steps for algorithms " + "like linear regression and neural networks that assume roughly normal input distributions.
" + ), + }, + "sub_kde_bandwidth": { + "tip": "Determines the optimal smoothing parameter for Kernel Density Estimation plots.", + "desc": ( + "KDE Bandwidth Analysis finds the best 'smoothing level' for density curve " + "estimation.
" + "What is a KDE? Kernel Density Estimation creates a smooth curve from your data " + "points by placing a small bell-curve (kernel) on each point and adding them up. " + "The bandwidth controls how wide each kernel is.
" + "Trade-off:
" + "Two automatic rules compared:
" + "Beginner tip: If the two rules give very different bandwidths, your data likely " + "has outliers or multiple modes (peaks). Check the histogram to confirm.
" + ), + }, + # ===== Advanced: Correlation+ ===== + "sub_partial_corr": { + "tip": "Reveals direct relationships between variables after removing confounding effects.", + "desc": ( + "Partial Correlation answers: Do these two variables have a direct relationship, " + "or is their correlation caused by a third variable?
" + "Example: Ice cream sales and drowning deaths are correlated -- but the partial " + "correlation controlling for temperature will be near zero. Temperature is the real driver.
" + "How it is computed: Using the inverse of the covariance matrix (precision matrix). " + "The negative off-diagonal elements, when normalised, give the partial correlation.
" + "How to interpret:
" + "Why it matters: Identifying true direct relationships (vs. spurious ones) is critical " + "for causal inference and building parsimonious models.
" + ), + }, + "sub_mutual_info": { + "tip": "Information-theoretic measure that captures both linear and non-linear dependencies.", + "desc": ( + "Mutual Information (MI) measures how much knowing one variable tells you about " + "another -- capturing any type of relationship, not just linear ones.
" + "Formula: MI(X,Y) = H(X) + H(Y) - H(X,Y), where H is Shannon entropy.
" + "Key properties:
" + "Compare with Pearson correlation: Pearson r might be zero for X and sin(X), but " + "MI will correctly detect the dependency.
" + "Beginner tip: If you see a high MI but low Pearson correlation between two variables, " + "there is a non-linear relationship worth investigating with a scatter plot.
" + ), + }, + "sub_bootstrap_ci": { + "tip": "Resampling-based 95% confidence interval for each pairwise correlation.", + "desc": ( + "Bootstrap Correlation Confidence Intervals tell you how reliable each correlation " + "estimate actually is.
" + "How it works:
" + "How to interpret the CI:
" + "Beginner tip: A correlation of r = 0.5 with a CI of [0.45, 0.55] is much more " + "trustworthy than the same r = 0.5 with a CI of [-0.1, 0.9]. Always check the CI.
" + ), + }, + "sub_distance_corr": { + "tip": "Szekely distance correlation that detects non-linear dependencies missed by Pearson.", + "desc": ( + "Distance Correlation is a modern statistical measure that equals zero " + "if and only if variables are truly independent. This is a stronger guarantee " + "than Pearson correlation, which can be zero even when strong non-linear patterns exist.
" + "Range: 0 (perfect independence) to 1 (strong dependence).
" + "Key comparison with Pearson:
" + "Beginner tip: Distance correlation is computationally more expensive than Pearson " + "but catches hidden patterns that Pearson completely misses.
" + ), + }, + # ===== Clustering ===== + "sub_kmeans": { + "tip": "K-Means partitioning with automatically optimised cluster count via silhouette analysis.", + "desc": ( + "K-Means Clustering automatically groups your data points into k clusters " + "where each point belongs to the cluster with the nearest mean (centroid).
" + "How it works:
" + "Key metrics:
" + "Beginner tip: K-Means assumes roughly spherical clusters of similar size. " + "If your data has irregularly shaped or very differently sized clusters, check the DBSCAN " + "results instead.
" + ), + }, + "sub_dbscan": { + "tip": "Density-based clustering that discovers clusters of arbitrary shape and identifies noise.", + "desc": ( + "DBSCAN (Density-Based Spatial Clustering) finds clusters by looking for areas " + "where data points are densely packed together.
" + "Key advantages over K-Means:
" + "Key parameters:
" + "Metrics shown:
" + "Beginner tip: If DBSCAN finds only 1 cluster with many noise points, the data may " + "not have clear density-based structure, or the eps parameter needs tuning.
" + ), + }, + "sub_hierarchical": { + "tip": "Agglomerative clustering dendrogram showing how clusters merge at each level.", + "desc": ( + "Hierarchical Clustering builds a tree-like structure (dendrogram) showing how " + "data points merge into progressively larger clusters.
" + "How it works:
" + "Reading the dendrogram: The y-axis shows the 'distance' (dissimilarity) at which " + "clusters merge. You can draw a horizontal line at any height to get a different number of " + "clusters. Large vertical gaps suggest natural cluster boundaries.
" + "Beginner tip: Look for long vertical lines in the dendrogram -- these represent " + "large jumps in dissimilarity and suggest natural groupings in your data.
" + ), + }, + "sub_cluster_profiles": { + "tip": "Statistical summary (mean, std) of each K-Means cluster across all features.", + "desc": ( + "Cluster Profiles describes what makes each cluster unique by showing the average " + "value and standard deviation of every feature within each cluster.
" + "How to use:
" + "Example interpretation: If Cluster 0 has high income + high age, and Cluster 1 " + "has low income + low age, the main clustering dimension is a socioeconomic one.
" + "Beginner tip: This table is excellent for giving meaningful names to clusters " + "(e.g. 'High-value customers', 'Budget shoppers') based on their most distinctive features.
" + ), + }, + # ===== Dimensionality Reduction ===== + "sub_tsne": { + "tip": "Non-linear 2D projection that preserves local neighbourhood structure for visualisation.", + "desc": ( + "t-SNE (t-Distributed Stochastic Neighbour Embedding) is a visualisation technique " + "that compresses high-dimensional data into a 2D scatter plot while preserving which data " + "points are similar to each other.
" + "Key parameter:
" + "How to read the plot: Points close together in the 2D plot were similar in the " + "original high-dimensional space. Distinct clusters in the plot suggest real groups in the data.
" + "⚠️ Important caveats:
" + "UMAP (Uniform Manifold Approximation and Projection) is a modern alternative " + "to t-SNE that is generally faster and better at preserving the global layout of the data.
" + "Advantages over t-SNE:
" + "How to read the plot: Similar interpretation to t-SNE. Points close together are " + "similar; distinct groupings suggest real clusters. But unlike t-SNE, the relative positions " + "of clusters carry some meaning too.
" + "Beginner tip: If t-SNE and UMAP show similar cluster structure, you can be more " + "confident that the clusters are real. If they disagree, investigate further.
" + ), + }, + "sub_factor_analysis": { + "tip": "Discovers latent (hidden) factors that explain the correlations among observed variables.", + "desc": ( + "Factor Analysis seeks to explain why certain variables are correlated by " + "hypothesising the existence of hidden (latent) factors.
" + "Analogy: Imagine you measure students' scores in 10 subjects. Factor Analysis " + "might discover 3 latent factors: 'Verbal ability', 'Mathematical ability', and " + "'Artistic ability', each influencing several subjects.
" + "How it works:
" + "Output: Number of retained factors and the noise variance (uniqueness) for " + "each variable -- how much of its variation is NOT explained by the factors.
" + "Beginner tip: High noise variance for a variable means the common factors do not " + "explain it well -- it may be measuring something unique.
" + ), + }, + "sub_factor_loadings": { + "tip": "Shows how strongly each observed variable relates to each latent factor.", + "desc": ( + "Factor Loadings quantify the relationship between each original variable and " + "each latent factor discovered by Factor Analysis.
" + "How to interpret the values:
" + "Cross-loadings: If a variable loads highly on multiple factors, it is measuring " + "a mix of constructs and may not be well-suited for the factor model.
" + "Beginner tip: Factor loadings are similar to PCA loadings but have a different " + "interpretation. In Factor Analysis, the latent factors are hypothesised causes; in PCA, " + "components are just mathematical summaries of variance.
" + ), + }, + "sub_feature_contrib": { + "tip": "Ranks features by their contribution to total variance using PCA loadings.", + "desc": ( + "PCA-Weighted Feature Contribution ranks original features by how much of the " + "total variance they contribute to, weighted across all principal components.
" + "How it is calculated: For each feature, sum the squared loadings across all " + "components, weighted by each component's eigenvalue proportion.
" + "Use cases:
" + "Beginner tip: Features near the bottom of the ranking contribute very little " + "to overall variance and are good candidates for removal in a preprocessing pipeline.
" + ), + }, + # ===== Feature Insights ===== + "sub_interaction": { + "tip": "Detects synergistic product-interaction effects between feature pairs.", + "desc": ( + "Interaction Detection checks whether the product of two features " + "contains information not present in either feature alone.
" + "Why it matters: In many real-world scenarios, the effect of one variable depends " + "on the value of another. For example, the effect of 'education' on 'salary' might depend " + "on 'years of experience'.
" + "How it works: For each pair of features, a product-interaction term (X₁ × X₂) " + "is created and its correlation with other features is measured.
" + "Beginner tip: Strong interaction effects are excellent candidates for feature " + "engineering -- adding the product as a new feature can significantly improve model performance.
" + ), + }, + "sub_monotonic": { + "tip": "Compares Pearson vs Spearman correlation to identify non-linear monotonic patterns.", + "desc": ( + "Monotonic Relationship Analysis detects variables that consistently increase (or decrease) " + "together, but not necessarily in a straight line.
" + "The key insight:
" + "Practical implication: If you find a pair with high Spearman but low Pearson, " + "applying a monotonic transform (log, sqrt, etc.) before linear modelling will improve fit.
" + ), + }, + "sub_binning": { + "tip": "Evaluates equal-width and equal-frequency binning with entropy analysis.", + "desc": ( + "Binning Analysis evaluates different strategies for converting continuous variables " + "into discrete categories (bins).
" + "Two strategies compared:
" + "Shannon entropy measures how evenly data is distributed across bins. " + "Lower entropy = more concentrated (uneven); higher entropy = more uniform.
" + "Beginner tip: Binning is useful when you need to convert a numeric feature into " + "categories (e.g. age groups) or when tree-based models need to handle extreme outliers.
" + ), + }, + "sub_cardinality": { + "tip": "Analyses unique-value counts and recommends encoding methods for categorical features.", + "desc": ( + "Cardinality & Encoding Recommendation analyses each categorical column's " + "number of unique values and suggests the best encoding strategy for machine learning.
" + "Encoding recommendations by cardinality:
" + "Why it matters: Most ML algorithms cannot handle text labels directly -- they need " + "numeric representation. Choosing the wrong encoding can waste memory, cause overfitting, or " + "lose important information.
" + ), + }, + "sub_leakage": { + "tip": "Flags features that might unintentionally leak target information.", + "desc": ( + "Leakage Risk Assessment checks for features that might be improperly providing " + "direct or indirect access to the target variable.
" + "Common leakage signals:
" + "Why it is dangerous: Data leakage causes models to show unrealistically high " + "accuracy during training/validation but fail catastrophically in production.
" + "Beginner tip: If your model seems 'too good to be true' (e.g. 99% accuracy), " + "leakage is the most likely culprit. Check this section carefully.
" + ), + }, + # ===== Advanced Anomaly ===== + "sub_iso_forest": { + "tip": "Tree-based anomaly detection that isolates outliers via random feature splits.", + "desc": ( + "Isolation Forest detects anomalies based on a simple idea: outliers are " + "easier to isolate than normal points.
" + "How it works:
" + "Anomaly score: More negative = more anomalous. The contamination rate (default 5%) " + "determines the threshold.
" + "Advantages: Works well in high dimensions, does not assume any specific distribution, " + "and is very fast.
" + "Beginner tip: Isolation Forest is often the best first choice for anomaly detection " + "because it requires minimal parameter tuning and handles mixed-type data well.
" + ), + }, + "sub_lof": { + "tip": "Density-based anomaly detection comparing each point's local density to its neighbours.", + "desc": ( + "Local Outlier Factor (LOF) identifies anomalies by comparing the local density " + "of each data point to the density of its k nearest neighbours.
" + "Intuition: A point in a sparse region surrounded by dense regions is anomalous. " + "A point in a uniformly sparse region is just an edge case, not an anomaly.
" + "LOF score interpretation:
" + "Advantage over Isolation Forest: LOF is better at detecting anomalies in datasets " + "with varying density -- e.g. a point that is normal in one region of the data but anomalous " + "in another.
" + "Beginner tip: LOF works best when clusters have different densities. If all clusters " + "are equally dense, Isolation Forest may suffice.
" + ), + }, + "sub_mahalanobis": { + "tip": "Multivariate distance from the data centre, accounting for feature correlations.", + "desc": ( + "Mahalanobis Distance measures how far each observation is from the centre of the " + "data distribution, taking into account the correlations between features.
" + "Comparison with Euclidean distance:
" + "Statistical foundation: Under multivariate normality, D² follows a chi-squared " + "distribution. Points exceeding the 97.5th percentile chi-squared critical value are flagged.
" + "Beginner tip: Mahalanobis is ideal when features are correlated. If two features " + "always move together, a point where one is high and the other is low is genuinely unusual -- " + "Mahalanobis will catch this, but Euclidean distance might not.
" + ), + }, + "sub_consensus": { + "tip": "Combines 3 anomaly methods -- flags points agreed upon by at least 2 out of 3.", + "desc": ( + "Consensus Anomaly Detection combines the results of Isolation Forest, LOF, and " + "Mahalanobis distance to produce a more reliable anomaly assessment.
" + "The voting rule: A point is flagged as anomalous only if at least 2 out of 3 " + "methods agree. This reduces false positives dramatically.
" + "Why consensus is better than any single method:
" + "Beginner tip: Start your anomaly investigation with the consensus flags. These are " + "the most reliable candidates for genuine anomalies and are worth investigating first.
" + ), + }, + # ===== Statistical Tests ===== + "test_levene": { + "tip": "Tests whether groups have equal variances (homoscedasticity assumption).", + "desc": ( + "Levene's Test checks whether different groups in your data have approximately " + "equal variances -- a key assumption for many statistical tests and ANOVA.
" + "Why it matters: Many tests (like t-tests and ANOVA) assume equal variances. " + "Violating this assumption can lead to incorrect conclusions.
" + "How to interpret:
" + "Advantage: Levene's test is more robust to non-normality than Bartlett's test, " + "making it the preferred choice in practice.
" + ), + }, + "test_kruskal_wallis": { + "tip": "Non-parametric ANOVA: tests whether multiple groups have the same distribution.", + "desc": ( + "Kruskal-Wallis Test is the non-parametric equivalent of one-way ANOVA. " + "It tests whether multiple groups come from the same distribution, without assuming normality.
" + "How it works: All values from all groups are ranked together. The test checks " + "whether the average ranks differ significantly across groups.
" + "How to interpret:
" + "Beginner tip: Use Kruskal-Wallis when your data is ordinal, non-normal, or has " + "outliers that would make ANOVA unreliable.
" + ), + }, + "test_mann_whitney": { + "tip": "Non-parametric test comparing the distributions of two independent groups.", + "desc": ( + "Mann-Whitney U Test (also called Wilcoxon rank-sum) compares two independent " + "groups to determine whether they come from the same distribution.
" + "How it works: All values from both groups are ranked together. The test measures " + "whether one group tends to have systematically higher ranks than the other.
" + "How to interpret:
" + "When to use: Ideal for ordinal data, non-normal distributions, small samples, " + "or when outliers make the t-test unreliable.
" + "Beginner tip: Mann-Whitney tests for differences in distribution shape, not just " + "the mean. Two groups with the same mean but different spreads can still yield a significant result.
" + ), + }, + "test_chi_square": { + "tip": "Tests whether observed category frequencies differ from expected frequencies.", + "desc": ( + "Chi-Square Goodness of Fit tests whether the observed frequency distribution " + "of categories matches what you would expect (by default, a uniform distribution).
" + "Formula: χ² = Σ (Observed - Expected)² / Expected
" + "How to interpret:
" + "Requirement: Each expected frequency should be ≥ 5 for the test to be valid. " + "With very small expected counts, consider Fisher's exact test instead.
" + "Beginner tip: This test is commonly used to check whether a categorical variable " + "has a balanced distribution or whether some categories dominate.
" + ), + }, + "test_grubbs": { + "tip": "Tests whether the single most extreme value in a dataset is a statistically significant outlier.", + "desc": ( + "Grubbs' Test evaluates whether the most extreme value in a dataset is a " + "statistically significant outlier, as opposed to a natural extreme of the distribution.
" + "Formula: G = max|Xᵢ - X̄| / s, where X̄ is the mean and s is the standard deviation.
" + "How to interpret:
" + "Assumption: The test assumes the data (excluding the potential outlier) is " + "approximately normally distributed.
" + "Beginner tip: Grubbs' test only checks one extreme value at a time. " + "For datasets with multiple outliers, use the IQR method or Isolation Forest instead.
" + ), + }, + "test_adf": { + "tip": "Tests whether a time series is stationary (constant statistical properties over time).", + "desc": ( + "Augmented Dickey-Fuller (ADF) Test determines whether a time series has a " + "unit root -- meaning it is non-stationary (its mean, variance, or autocorrelation " + "change over time).
" + "Why stationarity matters: Most time-series models (ARIMA, etc.) require stationary " + "input. Non-stationary data can produce spurious correlations and unreliable forecasts.
" + "How to interpret:
" + "Beginner tip: If your numeric column represents sequential measurements over time, " + "check ADF before running any regression. A non-stationary predictor can make regression " + "results meaningless.
" + ), + }, + # ===== Basic sub-sections ===== + "sub_column_quality": { + "tip": "Per-column quality scores for completeness, uniqueness, and validity.", + "desc": ( + "Column Quality breaks down the overall quality score into individual columns, " + "letting you identify exactly which columns have problems.
" + "Each column is scored on completeness (non-missing ratio), uniqueness (distinct value ratio), " + "and validity (values within expected ranges). This helps you prioritise which columns need " + "the most attention during data cleaning.
" + "Beginner tip: Columns with very low quality scores are the first targets for cleaning " + "or removal.
" + ), + }, + "sub_cleaning_log": { + "tip": "Step-by-step record of all automated data cleaning actions performed.", + "desc": ( + "Cleaning Log documents every transformation the system applied during preprocessing " + "for full transparency and reproducibility.
" + "This includes columns dropped, type conversions, encoding fixes, and any rows removed. " + "Review it carefully to ensure no important data was unexpectedly modified.
" + ), + }, + "sub_detected_issues": { + "tip": "List of data quality issues found during preprocessing.", + "desc": ( + "Detected Issues enumerates specific problems found in the raw data: mixed types " + "within a column, suspicious patterns (e.g. '999' or '-1' used as missing-value markers), " + "encoding problems, and more.
" + "Each issue includes the affected column and a description. Address these before running " + "production models.
" + ), + }, + "sub_normality_tests": { + "tip": "Multiple normality tests (Shapiro-Wilk, Anderson-Darling, Jarque-Bera) for each numeric column.", + "desc": ( + "Normality Tests & Shape Analysis applies three complementary tests to assess " + "whether each numeric column follows a normal distribution:
" + "If all three agree (p < 0.05), the column is very likely non-normal. If they disagree, " + "examine the histogram to understand why.
" + ), + }, + "sub_vif": { + "tip": "Variance Inflation Factor detects multicollinearity between features.", + "desc": ( + "VIF (Variance Inflation Factor) measures how much the variance of a regression " + "coefficient is inflated by correlation with other features.
" + "Formula: VIF = 1 / (1 - R²ᵢ), where R²ᵢ is the R-squared from regressing " + "feature i on all other features.
" + "Interpretation:
" + "Beginner tip: High VIF causes unstable regression coefficients. Even small changes " + "in data can flip coefficient signs.
" + ), + }, + "sub_summary": { + "tip": "Compact summary of distribution shape, normality test results, and outlier counts.", + "desc": ( + "Summary provides a quick-reference view combining skewness classification, " + "kurtosis type, normality indicators, and outlier counts in a single table.
" + "Use this as a rapid screening tool before diving into detailed per-column analysis.
" + ), + }, + "sub_variance_explained": { + "tip": "Shows how much variance each principal component captures (scree plot data).", + "desc": ( + "Variance Explained shows each principal component's individual and cumulative " + "contribution to the total variance.
" + "How to use the scree plot: Look for an 'elbow' -- the point where the curve " + "bends sharply and additional components add very little variance. Components before the elbow " + "contain most of the signal; those after contain mostly noise.
" + "Rule of thumb: Typically, retaining enough components to explain 80-95% of " + "cumulative variance is a good balance between dimensionality reduction and information loss.
" + ), + }, + "sub_loadings": { + "tip": "Shows each original feature's contribution to each principal component.", + "desc": ( + "PCA Loadings matrix shows the weight (contribution) of each original feature to " + "each principal component.
" + "Features with high absolute loadings on a component are the main contributors to that " + "component. Use loadings to interpret what each component represents in domain terms.
" + "Example: If PC1 has high loadings for 'height', 'weight', and 'BMI', you might " + "interpret PC1 as a 'body size' component.
" + ), + }, +} + +# -- Korean ------------------------------------------------------------ +METHOD_INFO["ko"] = { + "section_overview": { + "tip": "행/열 개수, 데이터 타입 분포, 메모리 사용량 등 데이터셋 전체 요약.", + "desc": ( + "개요(Overview)는 본격적인 분석에 앞서 데이터셋의 전체 구조를 한눈에 보여줍니다.
" + "확인할 수 있는 내용:
" + "왜 중요한가: 행 수와 타입을 먼저 확인하면 파일 로딩 오류(잘린 파일, 잘못된 구분자, " + "인코딩 문제)를 분석 전에 잡아낼 수 있습니다.
" + "초보자 팁: 행 수가 예상보다 훨씬 적다면 구분자(separator) 설정이 잘못되었을 수 있고, " + "수치 열이 '텍스트'로 표시되면 숫자가 아닌 문자가 섞여 있을 가능성이 높습니다.
" + ), + }, + "section_quality": { + "tip": "완전성·유일성·일관성·유효성 4개 차원으로 데이터 품질을 0-100% 평가.", + "desc": ( + "데이터 품질 평가는 데이터셋의 건강 상태를 네 가지 독립적 차원에서 각각 0~100%로 점수화합니다.
" + "4가지 품질 차원:
" + "점수 읽는 법: 90-100%: 우수 | 70-89%: 양호, 플래그 확인 | 70% 미만: 모델링 전 반드시 해결
" + "종합 점수: 0.35×완전성 + 0.25×유일성 + 0.20×일관성 + 0.20×유효성
" + ), + }, + "section_preprocessing": { + "tip": "분석 전 자동으로 수행된 모든 정제·변환 단계를 기록.", + "desc": ( + "전처리 로그는 시스템이 원본 데이터에 수행한 모든 자동 정제 작업을 순서대로 기록합니다.
" + "기록되는 전처리 예시:
" + "왜 중요한가: 재현성(reproducibility)은 신뢰할 수 있는 분석의 토대입니다. " + "어떤 변환이 적용되었는지 정확히 알아야 결과를 검증할 수 있습니다.
" + "초보자 팁: 중요한 열이 삭제되었다면, 원본 데이터에 형식 문제가 있어 수동 수정이 필요할 수 있습니다.
" + ), + }, + "section_descriptive": { + "tip": "각 열의 중심경향, 산포도, 분포 형태를 요약하는 기술통계량.", + "desc": ( + "기술통계량은 탐색적 데이터 분석(EDA)의 기초로, 각 열을 중심·산포·형태 수치로 요약합니다.
" + "수치 열에서 확인할 수 있는 항목:
" + "초보자 팁: 평균과 중앙값이 크게 다른 열이 있다면, 이상치나 심한 비대칭이 있다는 신호입니다.
" + ), + }, + "section_distribution": { + "tip": "히스토그램과 Q-Q 플롯으로 각 수치 열의 분포 형태를 시각화.", + "desc": ( + "분포 분석은 각 수치 열의 값이 어떻게 퍼져 있는지 시각적으로 보여줍니다.
" + "차트 유형:
" + "대표적 분포 형태: 종형(정규)·오른쪽 치우침(소득, 가격)·왼쪽 치우침(만점 근처 시험 점수)·" + "이봉(두 집단 혼합)·균일(모든 값 동일 확률)
" + "왜 중요한가: 많은 머신러닝 알고리즘은 정규분포 입력을 가정합니다. " + "실제 분포 형태를 알면 올바른 모델을 선택하거나 변환을 적용할 수 있습니다.
" + ), + }, + "section_correlation": { + "tip": "수치 열 간 피어슨(선형) 및 스피어만(순위) 상관관계를 측정.", + "desc": ( + "상관 분석은 수치 열 쌍 사이의 관계 강도를 측정합니다.
" + "두 가지 상관계수:
" + "히트맵 읽는 법: 진한 색 = 강한 상관. 빨강 = 양의 상관, 파랑 = 음의 상관.
" + "경고 기준: |r| > 0.90: 심각한 다중공선성, 하나 제거 고려 | " + "|r| > 0.70: 강한 상관, 모니터링 | |r| < 0.30: 약한 상관
" + "초보자 팁: 두 특성 간 높은 상관은 비슷한 정보를 가지고 있다는 뜻입니다. " + "둘 다 선형 모델에 포함하면 불안정(다중공선성)을 유발할 수 있습니다.
" + ), + }, + "section_missing": { + "tip": "결측 데이터의 패턴, 비율, 발생 메커니즘을 분석.", + "desc": ( + "결측치 분석은 데이터가 어디서, 얼마나, 왜 빠져 있는지 조사합니다.
" + "결측 메커니즘 3가지:
" + "실무 가이드: 5% 미만: 평균/중앙값 대체 | 5-30%: 고급 대체(KNN, MICE) | " + "50% 초과: 열 삭제 고려
" + ), + }, + "section_outlier": { + "tip": "IQR 펜스와 Z-점수를 사용하여 이상 데이터 포인트를 탐지.", + "desc": ( + "이상치 탐지는 나머지 데이터에서 비정상적으로 멀리 떨어진 값을 식별합니다.
" + "IQR 탐지 방법:
" + "중요: 모든 이상치가 오류는 아닙니다! 이상 탐지, 희귀 질환, 극한 날씨 등에서는 " + "이상치가 가장 흥미로운 데이터일 수 있습니다. 항상 조사 후 제거하세요.
" + ), + }, + "section_categorical": { + "tip": "범주형·불리언 열의 빈도분포, 막대차트, 엔트로피 분석.", + "desc": ( + "범주형 분석은 텍스트 라벨, 범주, 불리언 등 비수치 열을 검사합니다.
" + "주요 지표:
" + "초보자 팁: 막대차트에서 하나의 막대가 압도적으로 크면 '불균형'이 있어 " + "오버샘플링 등 특수 기법이 필요할 수 있습니다.
" + ), + }, + "section_importance": { + "tip": "분산·상호정보량을 사용하여 특성의 정보 가치를 순위화.", + "desc": ( + "특성 중요도는 어떤 열이 가장 유용한 정보를 담고 있는가?라는 질문에 답합니다.
" + "사용 방법:
" + "초보자 팁: 중요도가 낮은 특성을 무조건 삭제하지 마세요. " + "혼자서는 약해도 다른 특성과 결합하면 강력한 상호작용 효과가 있을 수 있습니다.
" + ), + }, + "section_pca": { + "tip": "주성분 분석(PCA)으로 데이터의 내재적 차원과 분산 구조를 파악.", + "desc": ( + "PCA(주성분 분석)은 상관된 특성을 분산 설명량 순서로 정렬된 비상관 성분으로 변환합니다.
" + "주요 출력:
" + "초보자 팁: PCA는 특성 스케일이 비슷할 때 가장 잘 작동합니다. " + "시스템이 자동으로 표준화(z-score)한 후 PCA를 적용합니다.
" + ), + }, + "section_duplicates": { + "tip": "통계를 부풀리거나 모델을 편향시킬 수 있는 완전 중복 행을 식별.", + "desc": ( + "중복 분석은 모든 열이 정확히 동일한 행을 스캔합니다.
" + "중복이 문제인 이유:
" + "초보자 팁: 소량(1% 미만)의 중복은 대개 무해합니다만, " + "예상 밖의 높은 비율은 항상 조사해야 합니다.
" + ), + }, + "section_warnings": { + "tip": "전체 분석에서 발견된 경고와 잠재적 이슈를 한 곳에 모아 표시.", + "desc": ( + "경고 & 이슈는 모든 분석에서 발견된 이상 징후를 한 곳에 모아 보여줍니다.
" + "대표적 경고: 높은 결측률(>30%), 상수 열, 다중공선성(|r|>0.90), 극단 이상치 수, " + "데이터 타입 불일치 등
" + "초보자 팁: 이 섹션을 우선순위 할 일 목록처럼 활용하세요. " + "심각도가 높은 경고부터 해결 후 분석을 재실행하면 품질 점수가 개선됩니다.
" + ), + }, + "sub_best_fit": { + "tip": "각 수치 열을 이론적 분포(정규, 감마, 와이블 등)와 비교해 최적 분포를 선택.", + "desc": ( + "최적 분포 적합은 각 수치 열을 정규, 로그정규, 지수, 감마, 베타, 와이블 등과 비교하여 " + "가장 잘 맞는 이론적 분포를 찾습니다.
" + "선택 기준:
" + "초보자 팁: 최적 분포가 'norm'(정규)이면 대부분의 표준 통계 검정이 바로 적용됩니다. " + "그렇지 않으면 로그 변환 등을 고려하세요.
" + ), + }, + "sub_jarque_bera": { + "tip": "왜도와 첨도가 정규분포와 일치하는지 검정.", + "desc": ( + "자크-베라(JB) 검정은 데이터의 형태가 정규분포(종형 곡선)과 일치하는지 검사합니다.
" + "원리: 정규분포는 왜도=0, 초과첨도=0입니다. JB 통계량은 이 이상값으로부터의 편차를 측정합니다.
" + "해석: p ≥ 0.05: 정규성 기각 불가 | p < 0.05: 유의하게 비정규
" + "초보자 팁: 비정규 데이터는 현실에서 매우 흔합니다. 정규성 검정 실패가 데이터가 " + "'나쁘다'는 의미가 아니라, 비모수 방법을 쓰거나 변환이 필요하다는 의미입니다.
" + ), + }, + "sub_power_transform": { + "tip": "Box-Cox 또는 Yeo-Johnson 변환으로 치우친 분포를 정규에 가깝게 변환.", + "desc": ( + "거듭제곱 변환 권장은 치우친 데이터를 더 종형(가우시안)으로 변환하는 방법을 제안합니다.
" + "두 가지 방법:
" + "초보자 팁: 거듭제곱 변환은 선형회귀나 신경망처럼 대략적인 정규 입력을 가정하는 " + "알고리즘의 필수 전처리 단계입니다.
" + ), + }, + "sub_kde_bandwidth": { + "tip": "커널 밀도 추정(KDE)에 최적인 평활 매개변수를 Scott/Silverman 규칙으로 분석.", + "desc": ( + "KDE 대역폭 분석은 밀도 곡선 추정의 최적 '평활 수준'을 찾습니다.
" + "트레이드오프: 작은 대역폭 = 세부 포착, 노이즈 과적합 | " + "큰 대역폭 = 부드러운 곡선, 중요 특성 놓침
" + "두 규칙: Scott 규칙(단봉 데이터에 적합) vs Silverman 규칙(이상치에 더 강건)
" + "초보자 팁: 두 규칙의 대역폭이 크게 다르면 이상치나 다봉 분포일 가능성이 높으므로 " + "히스토그램을 확인하세요.
" + ), + }, + "sub_partial_corr": { + "tip": "다른 모든 변수의 효과를 제거한 후 두 변수 간 직접 관계를 측정.", + "desc": ( + "편상관은 두 변수의 관계가 직접적인 것인지, 아니면 제3의 변수 때문인지?를 답합니다.
" + "예시: 아이스크림 판매량과 익사 사고는 상관이 있지만, 기온을 통제하면 편상관은 거의 0입니다.
" + "해석: 높은 편상관 = 진짜 직접 관계 | 거의 0 = 다른 변수를 매개로 한 허위 관계
" + ), + }, + "sub_mutual_info": { + "tip": "선형·비선형 의존성을 모두 포착하는 정보이론적 측정치.", + "desc": ( + "상호정보량(MI)은 한 변수를 알면 다른 변수에 대해 얼마나 알 수 있는지를 측정합니다.
" + "핵심 특성: MI=0이면 통계적 독립, MI>0이면 어떤 형태든 의존성이 있습니다. " + "피어슨이 0인 X와 sin(X)도 MI는 정확히 감지합니다.
" + "초보자 팁: MI는 높은데 피어슨은 낮다면 비선형 관계가 있으므로 산점도로 패턴을 확인하세요.
" + ), + }, + "sub_bootstrap_ci": { + "tip": "리샘플링 기반 각 쌍별 상관계수의 95% 신뢰구간.", + "desc": ( + "부트스트랩 상관 신뢰구간은 각 상관 추정치가 실제로 얼마나 신뢰할 수 있는지 알려줍니다.
" + "작동 방식: 복원 추출 1,000회 → 각각 상관계수 계산 → 2.5~97.5 백분위가 95% CI
" + "해석: 좁은 CI = 안정적 추정 | 넓은 CI = 높은 불확실성 | CI가 0을 포함 = 유의하지 않을 수 있음
" + ), + }, + "sub_distance_corr": { + "tip": "피어슨이 놓치는 비선형 의존성을 탐지하는 세켈리 거리상관.", + "desc": ( + "거리상관은 변수가 진정 독립일 때 그리고 오직 그때만 0이 됩니다. " + "피어슨보다 강력한 보장입니다.
" + "비교: 낮은 피어슨 + 높은 거리상관 → 비선형 관계 존재! 산점도로 패턴을 확인하세요.
" + ), + }, + "sub_kmeans": { + "tip": "실루엣 분석으로 최적 클러스터 수를 자동 결정하는 K-Means.", + "desc": ( + "K-Means 클러스터링은 데이터 포인트를 k개 그룹으로 자동 분할합니다.
" + "작동 방식: 표준화(z-score) → k=2~10 시도 → 최고 실루엣 점수의 k 선택
" + "지표: 실루엣 > 0.5: 좋은 군집화, > 0.7: 강한 구조 | 관성(WCSS): 낮을수록 밀집
" + "초보자 팁: K-Means는 비슷한 크기의 구형 클러스터를 가정합니다. " + "불규칙한 모양이면 DBSCAN 결과를 확인하세요.
" + ), + }, + "sub_dbscan": { + "tip": "임의 형태의 클러스터를 자동 발견하고 노이즈를 식별하는 밀도 기반 방법.", + "desc": ( + "DBSCAN은 데이터 포인트가 밀집된 영역을 찾아 클러스터를 형성합니다.
" + "K-Means 대비 장점: 클러스터 수 사전 지정 불필요, 임의 형태 클러스터 발견, " + "노이즈 포인트 자동 식별
" + "초보자 팁: 클러스터가 1개뿐이고 노이즈가 많다면, 데이터에 뚜렷한 밀도 구조가 " + "없거나 eps 파라미터 조정이 필요합니다.
" + ), + }, + "sub_hierarchical": { + "tip": "클러스터가 각 수준에서 어떻게 병합되는지 보여주는 덴드로그램.", + "desc": ( + "계층적 클러스터링은 트리 구조(덴드로그램)를 구축하여 데이터 포인트가 점차 큰 " + "클러스터로 합쳐지는 과정을 보여줍니다.
" + "덴드로그램 읽기: y축은 병합 '거리'(비유사도). 어느 높이에서든 수평선을 그으면 다른 k를 얻습니다. " + "긴 수직선은 자연스러운 클러스터 경계를 나타냅니다.
" + ), + }, + "sub_cluster_profiles": { + "tip": "각 K-Means 클러스터의 모든 특성에 대한 통계 요약(평균, 표준편차).", + "desc": ( + "클러스터 프로필은 각 클러스터의 평균/표준편차를 보여줘 무엇이 각 클러스터를 고유하게 만드는지 설명합니다.
" + "활용법: 클러스터 간 평균이 크게 다른 특성 = 클러스터를 정의하는 핵심 차별화 특성.
" + "초보자 팁: 이 테이블로 클러스터에 의미 있는 이름을 부여할 수 있습니다 " + "(예: '고가치 고객', '절약형 쇼퍼').
" + ), + }, + "sub_tsne": { + "tip": "지역 이웃 구조를 보존하는 비선형 2D 시각화 투영.", + "desc": ( + "t-SNE는 고차원 데이터를 2D 산점도로 압축하면서 어떤 포인트가 서로 유사한지를 보존합니다.
" + "플롯 읽기: 2D에서 가까운 점 = 원래 고차원 공간에서도 유사. 뚜렷한 군집 = 실제 그룹 가능.
" + "⚠️ 주의: 클러스터 간 거리는 의미 없음 | 클러스터 크기도 실제 크기를 반영하지 않음 | " + "매번 다른 모양의 플롯이 나올 수 있음(확률적 알고리즘)
" + ), + }, + "sub_umap": { + "tip": "지역+전역 데이터 구조를 모두 보존하는 빠른 비선형 2D 시각화.", + "desc": ( + "UMAP은 t-SNE의 현대적 대안으로, 일반적으로 더 빠르고 전역 레이아웃을 더 잘 보존합니다.
" + "t-SNE 대비 장점: 훨씬 빠름, 전역 구조 보존 우수, 클러스터 간 상대 위치에도 의미 있음
" + "초보자 팁: t-SNE와 UMAP이 비슷한 군집 구조를 보이면 그 군집이 실제일 가능성이 높습니다.
" + ), + }, + "sub_factor_analysis": { + "tip": "관측 변수 간 상관을 설명하는 숨겨진(잠재) 요인을 발견.", + "desc": ( + "요인 분석은 왜 특정 변수들이 서로 상관이 있는지를 숨겨진 잠재 요인으로 설명합니다.
" + "비유: 학생의 10개 과목 점수에서 '언어 능력', '수리 능력', '예술 능력' 같은 " + "3개 잠재 요인을 발견하는 것입니다.
" + "초보자 팁: 노이즈 분산이 높은 변수는 공통 요인으로 설명되지 않으며, " + "고유한 무언가를 측정하고 있을 수 있습니다.
" + ), + }, + "sub_factor_loadings": { + "tip": "각 관측 변수가 각 잠재 요인과 얼마나 강하게 관련되는지 표시.", + "desc": ( + "요인 적재량은 원래 변수와 잠재 요인 사이의 관계 강도를 수치화합니다.
" + "해석: |적재량| > 0.7: 강한 관계 | 0.4-0.7: 중간 | < 0.4: 약한 관계
" + "교차 적재: 한 변수가 여러 요인에 높게 적재되면, 요인 모델에 잘 맞지 않는 혼합 변수입니다.
" + ), + }, + "sub_feature_contrib": { + "tip": "PCA 적재량을 사용하여 각 특성의 총 분산 기여도를 순위화.", + "desc": ( + "PCA 가중 특성 기여도는 원래 특성들이 전체 분산에 얼마나 기여하는지 순위를 매겨 " + "비지도 특성 선택에 활용합니다.
" + "초보자 팁: 순위 하위의 특성은 전체 분산에 거의 기여하지 않으며 제거 후보입니다.
" + ), + }, + "sub_interaction": { + "tip": "특성 쌍 간 시너지적 곱-상호작용 효과를 탐지.", + "desc": ( + "상호작용 탐지는 두 특성의 곱이 개별 특성에는 없는 새로운 정보를 가지고 있는지 확인합니다.
" + "초보자 팁: 강한 상호작용 효과가 발견되면 해당 곱을 새로운 특성으로 추가하여 " + "모델 성능을 크게 향상시킬 수 있습니다.
" + ), + }, + "sub_monotonic": { + "tip": "피어슨(선형) vs 스피어만(순위) 상관 비교로 비선형 단조 패턴을 탐색.", + "desc": ( + "단조 관계 분석은 함께 일관되게 증가/감소하지만 직선은 아닌 변수 쌍을 탐지합니다.
" + "핵심: |스피어만| - |피어슨|이 크면 지수, 로그, 시그모이드 같은 비선형 단조 패턴입니다. " + "단조 변환(log, sqrt 등)을 적용하면 선형 모델 성능이 향상됩니다.
" + ), + }, + "sub_binning": { + "tip": "등폭·등빈도 구간화를 엔트로피 분석으로 비교 평가.", + "desc": ( + "구간화 분석은 연속 변수를 이산 범주(구간)로 변환하는 전략을 평가합니다.
" + "두 전략: 등폭(동일 구간 폭, 이상치에 민감) vs 등빈도(동일 데이터 수, 비대칭에 적합)
" + "초보자 팁: 구간화는 연령 그룹처럼 수치를 범주로 바꿀 때나 극단 이상치를 " + "다뤄야 하는 트리 모델에 유용합니다.
" + ), + }, + "sub_cardinality": { + "tip": "고유값 수를 분석하고 범주형 열에 적합한 인코딩 방법을 권장.", + "desc": ( + "카디널리티 & 인코딩 권장은 고유값 수에 따라 최적 인코딩을 제안합니다:
" + "누수 위험 평가는 타겟 변수에 직접/간접적으로 접근하는 특성을 검사합니다.
" + "위험 신호: 타겟과 거의 완벽한 상관, 카디널리티=행 수(ID 열), 미래 정보 포함 특성
" + "초보자 팁: 모델이 '너무 좋아 보이면'(예: 99% 정확도) 데이터 누수가 가장 의심됩니다.
" + ), + }, + "sub_iso_forest": { + "tip": "랜덤 특성 분할로 이상치를 격리하는 트리 기반 이상 탐지.", + "desc": ( + "고립 포레스트(Isolation Forest)는 '이상치는 격리하기 쉽다'는 아이디어에 기반합니다.
" + "작동 방식: 랜덤 분할 트리 구축 → 각 포인트 격리에 필요한 평균 분할 수 측정 → " + "적은 분할로 격리 = 더 이상적
" + "초보자 팁: 최소한의 파라미터 조정으로 고차원 데이터에서도 잘 작동하므로 " + "이상 탐지의 첫 번째 선택으로 추천됩니다.
" + ), + }, + "sub_lof": { + "tip": "각 포인트의 지역 밀도를 이웃과 비교하는 밀도 기반 이상 탐지.", + "desc": ( + "LOF(Local Outlier Factor)는 각 데이터 포인트의 지역 밀도를 k개 가장 가까운 이웃의 밀도와 비교합니다.
" + "LOF ≈ 1: 정상 | LOF >> 1: 이웃보다 훨씬 희소한 영역(이상적)
" + "초보자 팁: 클러스터마다 밀도가 다를 때 LOF가 Isolation Forest보다 효과적입니다.
" + ), + }, + "sub_mahalanobis": { + "tip": "특성 간 상관을 고려하여 데이터 중심으로부터의 다변량 거리를 측정.", + "desc": ( + "마할라노비스 거리는 공분산 구조를 고려하여 각 관측치가 데이터 중심에서 얼마나 먼지 측정합니다.
" + "유클리드 vs 마할라노비스: 유클리드는 모든 방향을 동등하게 취급하지만, " + "마할라노비스는 해당 방향에서 얼마나 비일상적인지로 측정합니다.
" + "초보자 팁: 특성이 상관된 경우 이상적입니다. 항상 함께 움직이는 두 특성에서 " + "하나만 높고 다른 하나는 낮으면 진짜 비정상 -- 마할라노비스가 잡아냅니다.
" + ), + }, + "sub_consensus": { + "tip": "3가지 이상 탐지 방법 중 2개 이상이 동의한 포인트를 플래그.", + "desc": ( + "합의 이상 탐지는 Isolation Forest, LOF, 마할라노비스의 결과를 결합합니다.
" + "투표 규칙: 3가지 방법 중 2개 이상 동의해야 이상으로 판정. " + "오탐(false positive)이 크게 줄어듭니다.
" + "초보자 팁: 합의 플래그부터 조사하세요 -- 가장 신뢰할 수 있는 이상치 후보입니다.
" + ), + }, + "test_levene": { + "tip": "그룹 간의 등분산성(homoscedasticity) 가정을 검정.", + "desc": ( + "레빈 검정은 서로 다른 그룹의 분산이 대략 같은지 확인합니다.
" + "해석: p > 0.05: 등분산 가정 성립 | p ≤ 0.05: 분산이 유의하게 다름, " + "웰치 t-검정이나 비모수 대안 사용 권장.
" + ), + }, + "test_kruskal_wallis": { + "tip": "비모수 ANOVA: 여러 그룹이 같은 분포에서 왔는지 검정.", + "desc": ( + "크루스칼-왈리스 검정은 일원 ANOVA의 비모수 버전입니다. " + "정규성 가정 없이 여러 그룹이 같은 분포에서 왔는지 검정합니다.
" + "해석: p < 0.05: 최소 하나의 그룹이 유의하게 다름 → 쌍별 만-휘트니 후속 검정 | " + "p ≥ 0.05: 유의한 차이 발견 안 됨
" + ), + }, + "test_mann_whitney": { + "tip": "비모수 이표본 검정: 두 독립 그룹의 분포를 비교.", + "desc": ( + "만-휘트니 U 검정(윌콕슨 순위합)은 두 독립 그룹이 같은 분포에서 왔는지 판단합니다.
" + "해석: p < 0.05: 두 그룹이 유의하게 다름 | p ≥ 0.05: 유의한 차이 없음
" + "초보자 팁: 분포의 '형태' 차이도 검출하므로, 평균이 같아도 산포가 다르면 유의할 수 있습니다.
" + ), + }, + "test_chi_square": { + "tip": "관측된 범주 빈도가 기대 빈도와 유의하게 다른지 검정.", + "desc": ( + "카이제곱 적합도 검정은 관측된 범주 분포가 기대 분포(기본: 균일)와 일치하는지 확인합니다.
" + "해석: p < 0.05: 관측 빈도가 기대와 유의하게 다름 | p ≥ 0.05: 기대 분포와 일치
" + "조건: 각 기대 빈도가 5 이상이어야 검정이 유효합니다.
" + ), + }, + "test_grubbs": { + "tip": "데이터의 최극단값이 통계적으로 유의한 이상치인지 검정.", + "desc": ( + "그럽스 검정은 데이터의 가장 극단적인 값이 분포의 자연스러운 극단인지, " + "아니면 유의미한 이상치인지 평가합니다.
" + "해석: p < 0.05: 유의한 이상치 | p ≥ 0.05: 예상 범위 내
" + "초보자 팁: 그럽스는 한 번에 하나의 극단값만 검정합니다. " + "여러 이상치가 있으면 IQR 방법이나 Isolation Forest를 사용하세요.
" + ), + }, + "test_adf": { + "tip": "시계열이 정상(stationary)인지, 즉 통계적 속성이 시간에 따라 일정한지 검정.", + "desc": ( + "ADF(Augmented Dickey-Fuller) 검정은 시계열에 단위근이 있는지, " + "즉 비정상(평균·분산이 시간에 따라 변함)인지 판단합니다.
" + "해석: p < 0.05: 정상 시계열 ✓ | p ≥ 0.05: 비정상, 차분(differencing)이나 추세 제거 고려
" + "초보자 팁: 순차적 측정값 열이라면 회귀 전에 ADF를 확인하세요. " + "비정상 예측 변수는 회귀 결과를 무의미하게 만들 수 있습니다.
" + ), + }, + "sub_column_quality": { + "tip": "열별 완전성·유일성·유효성 품질 점수.", + "desc": ( + "컬럼 품질은 전체 점수를 개별 열 단위로 분해하여 어떤 열에 문제가 있는지 정확히 파악합니다.
" + "초보자 팁: 품질 점수가 매우 낮은 열이 정제나 제거의 최우선 대상입니다.
" + ), + }, + "sub_cleaning_log": { + "tip": "모든 자동 정제 작업을 단계별로 기록한 로그.", + "desc": "전처리 로그는 완전한 투명성과 재현성을 위해 시스템이 적용한 모든 변환을 기록합니다.
", + }, + "sub_detected_issues": { + "tip": "전처리 과정에서 발견된 데이터 품질 이슈 목록.", + "desc": "탐지된 문제는 혼합 타입, 의심스러운 패턴('999' 결측값 마커 등), 인코딩 오류 등을 열거합니다.
", + }, + "sub_normality_tests": { + "tip": "각 수치 열에 대한 Shapiro-Wilk, Anderson-Darling, Jarque-Bera 정규성 검정.", + "desc": ( + "정규성 검정 & 형태 분석은 세 가지 보완적 검정으로 각 열이 정규분포를 따르는지 평가합니다.
" + "세 검정 모두 p < 0.05이면 비정규일 가능성이 높습니다. 불일치하면 히스토그램을 확인하세요.
" + ), + }, + "sub_vif": { + "tip": "분산팽창계수(VIF)로 특성 간 다중공선성을 탐지.", + "desc": ( + "VIF는 각 특성의 회귀 계수 분산이 다른 특성과의 상관에 의해 얼마나 팽창하는지 측정합니다.
" + "해석: VIF=1: 상관 없음 | 1-5: 낮음 | 5-10: 보통 | >10: 심각, 특성 제거나 결합 고려
" + "초보자 팁: 높은 VIF는 불안정한 회귀 계수를 유발합니다. " + "데이터가 조금만 변해도 계수 부호가 뒤바뀔 수 있습니다.
" + ), + }, + "sub_summary": { + "tip": "분포 형태, 정규성 결과, 이상치 수의 빠른 요약.", + "desc": "요약은 왜도 분류, 첨도 유형, 정규성 지표, 이상치 수를 한 테이블에 제공합니다.
", + }, + "sub_variance_explained": { + "tip": "각 주성분이 포착하는 분산 비율(스크리 플롯 데이터).", + "desc": ( + "설명된 분산은 각 주성분의 개별 및 누적 분산 기여율을 보여줍니다.
" + "스크리 플롯: '엘보' 지점(곡선이 급격히 꺾이는 곳)이 추가 성분이 큰 가치를 더하지 못하는 시점입니다.
" + ), + }, + "sub_loadings": { + "tip": "각 원래 특성이 각 주성분에 기여하는 정도.", + "desc": ( + "PCA 적재량은 각 원래 특성의 주성분별 가중치(기여도)를 보여줍니다.
" + "예시: PC1이 '키', '몸무게', 'BMI'에 높은 적재량을 가지면, " + "PC1을 '체격' 성분으로 해석할 수 있습니다.
" + ), + }, +} + +# -- Chinese ----------------------------------------------------------- +METHOD_INFO["zh"] = { + "section_overview": { + "tip": "数据集概要:行/列数、数据类型分布、内存占用。", + "desc": ( + "概览在深入分析之前,展示数据集的整体结构。
" + "包含信息:
" + "重要性:先检查行数和类型可以在分析前捕获加载错误(截断文件、分隔符错误、编码问题)。
" + "初学者提示:如果行数远少于预期,可能是分隔符设置有误;如果数值列显示为“文本”,说明含有非数字字符。
" + ), + }, + "section_quality": { + "tip": "从完整性·唯一性·一致性·有效性四个维度评估数据质量(0-100%)。", + "desc": ( + "数据质量评估从四个独立维度对数据进行“健康检查”,每项0~100%。
" + "四个维度:
" + "分数标准:90-100%优秀 | 70-89%可接受,需关注标记 | <70%需在建模前解决
" + "综合公式:0.35×完整性 + 0.25×唯一性 + 0.20×一致性 + 0.20×有效性
" + ), + }, + "section_preprocessing": { + "tip": "记录分析前自动执行的所有清洗和转换步骤。", + "desc": ( + "预处理日志按顺序记录系统对原始数据执行的每一步自动清洗操作。
" + "常见步骤:删除空/常量列、字符串转数值、编码修复、无法解析的行移除。
" + "重要性:可重复性是可信分析的基础。必须清楚知道做了哪些变换才能验证结果。
" + "初学者提示:如果重要列被删除了,说明原始数据可能有格式问题,需要手动修复。
" + ), + }, + "section_descriptive": { + "tip": "每列的集中趋势、离散度和分布形态统计汇总。", + "desc": ( + "描述性统计是探索性数据分析(EDA)的基础,用数字概括每列的中心、散布和形状。
" + "数值列指标:
" + "初学者提示:均值与中位数差异大的列可能有异常值或严重偏斜。
" + ), + }, + "section_distribution": { + "tip": "通过直方图和Q-Q图可视化每个数值列的分布形态。", + "desc": ( + "分布分析让你直观“看到”每列值的分布形状。
" + "图表类型:
" + "常见形状:钟形(正态)·右偏(收入、价格)·左偏(接近满分的考试成绩)·双峰(两个子群混合)·均匀
" + "重要性:许多ML算法假设正态输入。了解实际分布有助于选择正确模型或应用变换。
" + ), + }, + "section_correlation": { + "tip": "衡量数值列间的Pearson(线性)和Spearman(秩)相关。", + "desc": ( + "相关分析测量数值列对之间的关系强度。
" + "两种相关系数:
" + "热力图:颜色越深=相关越强。红色=正相关, 蓝色=负相关。
" + "警戒线:|r|>0.90严重多重共线性 | |r|>0.70强相关 | |r|<0.30弱相关
" + "初学者提示:两个高度相关的特征携带相似信息,同时放入线性模型会导致不稳定。
" + ), + }, + "section_missing": { + "tip": "分析缺失数据的模式、比例和产生机制。", + "desc": ( + "缺失数据分析调查数据在哪里、多少、为什么缺失。
" + "三种缺失机制:
" + "实操指南:<5%可删或均值填充 | 5-30%用KNN/MICE | >50%考虑删除该列
" + ), + }, + "section_outlier": { + "tip": "使用IQR围栏和Z分数检测异常数据点。", + "desc": ( + "异常值检测识别远离其他数据的异常点。
" + "IQR方法:IQR=Q3-Q1 | 一般异常:
重要:并非所有异常值都是错误!在欺诈检测、罕见疾病等领域,异常值可能是最有价值的数据。
" + "初学者提示:箱线图中超出须(whisker)的点是需要查看的潜在异常值。
" + ), + }, + "section_categorical": { + "tip": "类别/布尔列的频率分布、柱状图和熵分析。", + "desc": ( + "类别分析检查非数值列——文本标签、类别、布尔值。
" + "关键指标:
" + "初学者提示:柱状图中某个柱远高于其他,说明该列“不平衡”,可能需要过采样等技术。
" + ), + }, + "section_importance": { + "tip": "用方差和互信息量按信息价值排列特征。", + "desc": ( + "特征重要性回答:哪些列携带最有用的信息?
" + "方法:方差(≈0的常量列无信息)·平均相关·互信息(捕获线性+非线性)
" + "初学者提示:不要盲目删除低重要性特征——它们单独弱,但与其他特征组合后可能很强(交互效应)。
" + ), + }, + "section_pca": { + "tip": "主成分分析揭示数据的内在维度和方差结构。", + "desc": ( + "PCA(主成分分析)将相关特征转换为按方差排序的不相关成分。
" + "关键输出:碎石图(各成分方差)·累积方差(保留90%需多少成分)·载荷矩阵(特征权重)
" + "初学者提示:PCA在特征尺度相近时效果最好。系统已自动标准化(z-score)后再执行PCA。
" + ), + }, + "section_duplicates": { + "tip": "识别可能膨胀统计量或偏移模型的完全重复行。", + "desc": ( + "重复项分析扫描所有列完全相同的行。
" + "危害:膨胀样本量使置信区间偏窄·训练测试集泄露·接近100%的重复率通常是加载错误。
" + "初学者提示:少量重复(<1%)通常无害,但意外的高比率必须调查。
" + ), + }, + "section_warnings": { + "tip": "汇聚所有分析中发现的警告和潜在问题。", + "desc": ( + "警告和问题将全部分析中发现的异常集中展示。
" + "常见警告:高缺失率(>30%)·常量列·多重共线性·极端异常值·类型不匹配
" + "初学者提示:把这里当作优先待办清单,先解决高严重度警告。
" + ), + }, + "sub_best_fit": { + "tip": "将每列与理论分布(正态、Gamma、Weibull等)比较,选择最佳拟合。", + "desc": ( + "最佳拟合分布将每个数值列与正态、对数正态、指数、Gamma等分布进行比较。
" + "选择标准:AIC(越低越好)·KS统计量(越小越好)·p>0.05表示可接受
" + "初学者提示:最佳分布为“norm”(正态)则大部分标准检验可直接使用;否则考虑对数变换。
" + ), + }, + "sub_jarque_bera": { + "tip": "检验偏度和峰度是否符合正态分布。", + "desc": ( + "Jarque-Bera检验专门检查数据的形状是否符合正态。
" + "解读:p≥0.05=不能拒绝正态 | p<0.05=显著非正态
" + "初学者提示:非正态数据非常常见,这不意味着数据“坏”,而是需要使用非参数方法或变换。
" + ), + }, + "sub_power_transform": { + "tip": "推荐Box-Cox/Yeo-Johnson变换使偏斜分布更接近正态。", + "desc": ( + "幂变换推荐建议可将偏斜数据转为更加钟形的数学变换。
" + "两种方法:Box-Cox(仅正数) | Yeo-Johnson(任意数据)
" + "初学者提示:幂变换是线性回归和神经网络的必要预处理步骤。
" + ), + }, + "sub_kde_bandwidth": { + "tip": "用Scott/Silverman规则确定核密度估计的最优平滑参数。", + "desc": ( + "KDE带宽分析寻找密度曲线的最佳“平滑度”。
" + "权衡:小带宽=捕捉细节但过拟合噪声 | 大带宽=平滑但可能遗漏特征
" + "初学者提示:两种规则给出的带宽差异大时,数据可能有异常值或多峰。
" + ), + }, + "sub_partial_corr": { + "tip": "控制其他所有变量后测量两变量间的直接关系。", + "desc": ( + "偏相关回答:两变量的关系是直接的,还是由第三变量引起的?
" + "例子:冰淇淋销量与溺水事故相关,但控制温度后偏相关接近零。温度才是真正驱动因素。
" + "解读:高偏相关=直接关系 | 接近零=虚假关系(被其他变量中介)
" + ), + }, + "sub_mutual_info": { + "tip": "捕获线性与非线性依赖的信息论度量。", + "desc": ( + "互信息(MI)衡量知道一个变量后能获得多少关于另一个变量的信息。
" + "核心:MI=0=统计独立 | MI>0=存在依赖。可捕获任何关系类型,包括Pearson为零的非线性关系。
" + "初学者提示:MI高但Pearson低说明存在非线性关系,建议查看散点图。
" + ), + }, + "sub_bootstrap_ci": { + "tip": "每对相关系数的重采样95%置信区间。", + "desc": ( + "Bootstrap相关置信区间告诉你每个相关估计实际有多可靠。
" + "原理:有放回抽样1000次→计算每次相关→2.5~97.5百分位=95% CI
" + "解读:窄CI=稳定 | 宽CI=不确定性高 | CI跨零=可能不显著
" + ), + }, + "sub_distance_corr": { + "tip": "Szekely距离相关——检测Pearson遗漏的非线性依赖。", + "desc": ( + "距离相关当且仅当变量真正独立时为零——比Pearson更强的保证。
" + "比较:低Pearson+高距离相关→存在非线性关系!用散点图发现模式。
" + ), + }, + "sub_kmeans": { + "tip": "通过轮廓分析自动优化聚类数的K-Means。", + "desc": ( + "K-Means聚类自动将数据分为k组。
" + "流程:标准化→k=2~10逐一尝试→选择最高轮廓分数的k
" + "指标:轮廓分>0.5=好 | >0.7=强结构 | 惯性(WCSS):越低越紧凑
" + "初学者提示:K-Means假设大致球形、大小相近的簇。不规则形状请看DBSCAN。
" + ), + }, + "sub_dbscan": { + "tip": "自动发现任意形状簇并识别噪声的密度聚类。", + "desc": ( + "DBSCAN通过寻找数据密集区域形成簇。
" + "优势:无需指定k·可发现任意形状·自动识别噪声点
" + "初学者提示:如果只找到1个簇且噪声很多,可能数据没有明显密度结构或eps需要调。
" + ), + }, + "sub_hierarchical": { + "tip": "展示簇在各层级如何合并的树状图。", + "desc": ( + "层次聚类构建树形结构(树状图)展示数据逐步合并的过程。
" + "读图:y轴=合并“距离”。在任意高度画水平线可得不同k。长竖线=自然簇边界。
" + ), + }, + "sub_cluster_profiles": { + "tip": "每个K-Means簇在所有特征上的统计摘要。", + "desc": ( + "聚类画像展示每个簇的均值/标准差,说明每个簇的独特之处。
" + "用途:簇间均值差异大的特征=定义簇的关键区分特征。
" + "初学者提示:用这张表为簇命名(如“高价值客户”“节约型买家”)。
" + ), + }, + "sub_tsne": { + "tip": "保留局部邻域结构的非线性2D可视化投影。", + "desc": ( + "t-SNE将高维数据压缩到2D散点图,同时保留哪些点彼此相似。
" + "读图:2D中靠近=原始空间中相似。明显分组=可能是真实簇。
" + "⚠️ 注意:簇间距离无意义·簇大小不反映实际·每次运行结果可能不同(随机算法)
" + ), + }, + "sub_umap": { + "tip": "同时保留局部和全局结构的快速非线性2D可视化。", + "desc": ( + "UMAP是t-SNE的现代替代,通常更快且更好保留全局布局。
" + "优势:速度快·全局结构保留好·簇间相对位置有一定意义
" + "初学者提示:如果t-SNE和UMAP显示相似簇结构,这些簇很可能是真实的。
" + ), + }, + "sub_factor_analysis": { + "tip": "发现解释观测变量间相关性的隐藏(潜在)因子。", + "desc": ( + "因子分析解释为什么某些变量彼此相关——假设存在隐藏的潜在因子。
" + "类比:10科成绩中可能隐含“语言能力”“数学能力”“艺术能力”三个潜在因子。
" + "初学者提示:噪声方差高的变量说明它不被公共因子解释,可能在测量独特的东西。
" + ), + }, + "sub_factor_loadings": { + "tip": "显示每个观测变量与每个潜在因子的关联强度。", + "desc": ( + "因子载荷量化原始变量与潜在因子之间的关系。
" + "解读:|载荷|>0.7=强 | 0.4-0.7=中等 | <0.4=弱
" + "交叉载荷:一个变量在多个因子上高载荷——不适合因子模型的混合变量。
" + ), + }, + "sub_feature_contrib": { + "tip": "用PCA载荷加权各特征的方差贡献,排名特征重要性。", + "desc": ( + "PCA加权特征贡献按对总方差的贡献排名原始特征,用于无监督特征选择。
" + "初学者提示:排名末尾的特征对整体方差贡献极小,是可以考虑移除的候选。
" + ), + }, + "sub_interaction": { + "tip": "检测特征对之间的协同乘积交互效应。", + "desc": ( + "交互检测检验两个特征的乘积是否包含单个特征中没有的信息。
" + "初学者提示:找到强交互后,将乘积作为新特征加入可以显著提升模型性能。
" + ), + }, + "sub_monotonic": { + "tip": "比较Pearson与Spearman识别非线性单调模式。", + "desc": ( + "单调关系分析检测一致增减但非直线的变量对。
" + "关键:|Spearman|-|Pearson|差距大=指数、对数等非线性单调模式。应用单调变换可改善线性模型。
" + ), + }, + "sub_binning": { + "tip": "用熵分析评估等宽和等频分箱策略。", + "desc": ( + "分箱分析评估将连续变量转为离散类别的策略。
" + "两种策略:等宽(相同区间宽度,对异常值敏感) vs 等频(相同数据量,适合偏斜数据)
" + ), + }, + "sub_cardinality": { + "tip": "分析唯一值数量并推荐类别编码方法。", + "desc": ( + "基数与编码推荐按唯一值数建议最佳编码策略:
" + "泄漏风险检查可能直接/间接获取目标变量的特征。
" + "初学者提示:如果模型精度“好得不真实”(如99%),数据泄漏是首要嫌疑。
" + ), + }, + "sub_iso_forest": { + "tip": "通过随机分割隔离异常值的树模型。", + "desc": ( + "隔离森林基于“异常值更容易被隔离”的思想。
" + "原理:构建随机分割树→测量每个点的平均隔离路径长度→路径短=更异常
" + "初学者提示:参数少、高维数据表现好,是异常检测的首选方法。
" + ), + }, + "sub_lof": { + "tip": "将每个点的局部密度与邻居比较的密度型检测。", + "desc": ( + "LOF比较每个点的局部密度与k近邻的密度。
" + "LOF≈1=正常 | LOF>>1=比邻居稀疏得多(异常)
" + "初学者提示:当各簇密度不同时,LOF比隔离森林更有效。
" + ), + }, + "sub_mahalanobis": { + "tip": "考虑特征相关性的数据中心多变量距离。", + "desc": ( + "马氏距离考虑协方差结构测量每个观测值到数据中心的距离。
" + "vs 欧氏距离:欧氏对所有方向一视同仁;马氏考虑异常程度——两个通常同涨同跌的特征," + "一高一低才是真正异常。
" + ), + }, + "sub_consensus": { + "tip": "3种方法中≥2种同意时标记为异常。", + "desc": ( + "共识异常检测结合隔离森林、LOF和马氏距离。
" + "规则:≥2/3方法同意→标记为异常,大大减少误报。
" + "初学者提示:从共识标记开始调查——这些是最可靠的异常候选。
" + ), + }, + "test_levene": { + "tip": "检验各组方差是否相等(齐方差假设)。", + "desc": ( + "Levene检验确认不同组的方差是否大致相等。
" + "解读:p>0.05=齐方差成立 | p≤0.05=方差显著不同,建议用Welch t检验或非参数方法。
" + ), + }, + "test_kruskal_wallis": { + "tip": "非参数ANOVA:检验多组是否来自相同分布。", + "desc": ( + "Kruskal-Wallis检验是单因素ANOVA的非参数版本,无需正态假设。
" + "解读:p<0.05=至少一组显著不同→做两两Mann-Whitney | p≥0.05=无显著差异
" + ), + }, + "test_mann_whitney": { + "tip": "非参数两样本检验:比较两独立组的分布。", + "desc": ( + "Mann-Whitney U检验判断两组是否来自相同分布。
" + "解读:p<0.05=两组显著不同 | p≥0.05=无显著差异
" + "初学者提示:即使均值相同,若离散度不同也可能显著。
" + ), + }, + "test_chi_square": { + "tip": "检验观测类别频率是否偏离期望频率。", + "desc": ( + "卡方适合度检验检查观测分布是否符合期望(默认均匀)。
" + "解读:p<0.05=显著偏离 | p≥0.05=一致
" + "条件:每个期望频率须≥5。
" + ), + }, + "test_grubbs": { + "tip": "检验最极端值是否为统计显著异常值。", + "desc": ( + "Grubbs检验评估最极端值是分布自然极端还是显著异常。
" + "解读:p<0.05=显著异常值 | p≥0.05=在预期范围内
" + "初学者提示:Grubbs一次只检验一个极端值。多个异常值请用IQR或隔离森林。
" + ), + }, + "test_adf": { + "tip": "检验时间序列是否平稳(统计特性随时间不变)。", + "desc": ( + "ADF检验判断时间序列是否有单位根(非平稳)。
" + "解读:p<0.05=平稳✓ | p≥0.05=非平稳,考虑差分或去趋势
" + "初学者提示:顺序测量列在回归前必须检查ADF。非平稳预测变量会使回归结果无意义。
" + ), + }, + "sub_column_quality": { + "tip": "每列的完整性·唯一性·有效性质量评分。", + "desc": ( + "列质量将总分拆解到每列,精确定位问题列。
" + "初学者提示:质量分极低的列是清洗或删除的首要目标。
" + ), + }, + "sub_cleaning_log": { + "tip": "所有自动清洗操作的逐步记录。", + "desc": "清洗日志记录系统执行的每一步变换,保证完全透明和可重复。
", + }, + "sub_detected_issues": { + "tip": "预处理中发现的数据质量问题清单。", + "desc": "检测到的问题列举混合类型、可疑模式(如'999'缺失标记)、编码错误等。
", + }, + "sub_normality_tests": { + "tip": "每列的Shapiro-Wilk、Anderson-Darling、Jarque-Bera正态性检验。", + "desc": ( + "正态性检验用三种互补检验评估每列是否服从正态分布。
" + "三者都p<0.05则很可能非正态。如果结果不一致,查看直方图确认原因。
" + ), + }, + "sub_vif": { + "tip": "通过方差膨胀因子检测多重共线性。", + "desc": ( + "VIF衡量回归系数方差因特征间相关而膨胀多少。
" + "解读:VIF=1无相关 | 1-5低 | 5-10中等 | >10严重,考虑移除或合并
" + "初学者提示:高VIF导致系数不稳定,数据微小变化就可能翻转系数符号。
" + ), + }, + "sub_summary": { + "tip": "分布形态、正态性和异常值数的快速汇总。", + "desc": "摘要在一张表中提供偏度分类、峰度类型、正态性指标和异常值计数。
", + }, + "sub_variance_explained": { + "tip": "每个主成分所捕获的方差比例(碎石图数据)。", + "desc": ( + "解释方差展示每个主成分的个体和累积方差贡献率。
" + "碎石图:“肘部”(曲线急弯处)是追加成分价值变小的转折点。
" + ), + }, + "sub_loadings": { + "tip": "每个原始特征对每个主成分的贡献权重。", + "desc": ( + "PCA载荷展示每个原始特征在各主成分上的权重。
" + "示例:PC1在“身高”“体重”“BMI”上载荷高,可解读为“体型”成分。
" + ), + }, +} + +# -- Japanese ---------------------------------------------------------- +METHOD_INFO["ja"] = { + "section_overview": { + "tip": "データセット概要:行/列数、データ型分布、メモリ使用量。", + "desc": ( + "概要は、詳細分析の前にデータセット全体の構造を把握するためのセクションです。
" + "含まれる情報:
" + "重要性:行数と型を最初に確認することで、読み込みエラー(ファイル切断、区切り文字ミス、文字化け)を早期発見できます。
" + "初心者向けヒント:行数が予想より少なければ区切り文字の問題、数値列がテキストと表示されていれば非数値文字が混入している可能性があります。
" + ), + }, + "section_quality": { + "tip": "完全性・一意性・一貫性・妥当性の4次元で品質を0-100%で評価。", + "desc": ( + "データ品質評価は、4つの独立した軸でデータの「健康診断」を行います。各軸0〜100%。
" + "4つの軸:
" + "スコア基準:90-100%=優秀 | 70-89%=注意すべきフラグあり | <70%=モデリング前に対処必須
" + "総合式:0.35×完全性 + 0.25×一意性 + 0.20×一貫性 + 0.20×妥当性
" + ), + }, + "section_preprocessing": { + "tip": "分析前に自動実行した全クリーニング/変換ステップの記録。", + "desc": ( + "前処理ログは、生データに対して実行された全自動クリーニング操作を時系列で記録します。
" + "一般的な操作:空/定数列の削除、文字列→数値変換、エンコーディング修正、解析不能行の除去。
" + "重要性:再現性は信頼できる分析の基盤です。結果を検証するには、どの変換が行われたか正確に知る必要があります。
" + "初心者向けヒント:重要な列が削除された場合、元データにフォーマットの問題がある可能性があります。手動修正を検討してください。
" + ), + }, + "section_descriptive": { + "tip": "各列の中心傾向・ばらつき・分布形状の統計要約。", + "desc": ( + "記述統計は探索的データ分析(EDA)の基礎で、各列の中心・広がり・形状を数値で要約します。
" + "数値列の指標:
" + "初心者向けヒント:平均と中央値の差が大きい列は、外れ値や強い偏りがある可能性があります。
" + ), + }, + "section_distribution": { + "tip": "ヒストグラムとQ-Qプロットで各数値列の分布形状を可視化。", + "desc": ( + "分布分析により、各列の値の分布を「目で見て」理解できます。
" + "グラフの種類:
" + "よくある形状:釣り鐘型(正規)・右歪み(収入、価格)・左歪み(満点に近い成績)・二峰性(2グループの混合)・一様
" + "重要性:多くのML手法は正規的な入力を仮定します。分布を知ることでモデル選択や変換の判断ができます。
" + ), + }, + "section_correlation": { + "tip": "数値列間のPearson(線形)とSpearman(順位)相関を測定。", + "desc": ( + "相関分析は変数ペア間の関係の強さを測定します。
" + "2種類の相関:
" + "ヒートマップ:色が濃い=相関が強い。赤=正相関、青=負相関。
" + "警戒ライン:|r|>0.90 重度の多重共線性 | |r|>0.70 強い相関 | |r|<0.30 弱い相関
" + "初心者向けヒント:高相関の特徴量は類似情報を持ちます。線形モデルに両方入れると不安定になります。
" + ), + }, + "section_missing": { + "tip": "欠損データのパターン・割合・メカニズムを分析。", + "desc": ( + "欠損分析は、データがどこで、どれだけ、なぜ欠損しているかを調査します。
" + "3つのメカニズム:
" + "対処の目安:<5%=削除・平均補完 | 5-30%=KNN/MICE | >50%=列の削除を検討
" + ), + }, + "section_outlier": { + "tip": "IQRフェンスとZスコアで異常データポイントを検出。", + "desc": ( + "外れ値検出は、他のデータから極端に離れた値を特定します。
" + "IQR法:IQR=Q3-Q1 | 一般外れ値:
重要:すべての外れ値がエラーではありません!不正検知や希少疾患では、外れ値こそ最も価値あるデータです。
" + "初心者向けヒント:箱ひげ図のひげ(whisker)を超えた点が調査すべき外れ値候補です。
" + ), + }, + "section_categorical": { + "tip": "カテゴリ/ブール列の頻度分布・棒グラフ・エントロピー分析。", + "desc": ( + "カテゴリ分析は非数値列—テキストラベル、カテゴリ、ブール値を検査します。
" + "主要指標:
" + "初心者向けヒント:棒グラフで1本の棒が異常に高ければ「不均衡」です。オーバーサンプリング等の対策が必要な場合があります。
" + ), + }, + "section_importance": { + "tip": "分散と相互情報量で特徴量を情報価値順にランク付け。", + "desc": ( + "特徴量重要度は、どの列が最も有用な情報を持っているか?に答えます。
" + "手法:分散(≈0の定数列は無情報)・平均相関・相互情報量(線形+非線形を捕捉)
" + "初心者向けヒント:低重要度の特徴を盲目的に削除しないでください。単独では弱くても他の特徴と組み合わせると強力になれます(交互作用効果)。
" + ), + }, + "section_pca": { + "tip": "主成分分析でデータの内在次元と分散構造を解明。", + "desc": ( + "PCA(主成分分析)は相関のある特徴を分散順に並んだ無相関の成分に変換します。
" + "主要出力:スクリープロット(各成分の分散)・累積分散(90%保持に必要な成分数)・負荷量行列(特徴の重み)
" + "初心者向けヒント:PCAは特徴のスケールが揃っている時に最も有効です。システムはz-scoreで自動標準化してからPCAを実行しています。
" + ), + }, + "section_duplicates": { + "tip": "統計量の膨張やモデルの偏りを招く完全重複行を特定。", + "desc": ( + "重複分析は、全列が完全に同一の行をスキャンします。
" + "影響:サンプルサイズの膨張で信頼区間が狭くなる・訓練/テスト間のデータ漏洩・100%近い重複率はロードエラーの可能性大。
" + "初心者向けヒント:少量の重複(<1%)は通常無害ですが、予想外に高い場合は必ず調査してください。
" + ), + }, + "section_warnings": { + "tip": "全分析から抽出された警告と潜在的問題を集約。", + "desc": ( + "警告と問題は、全分析で検出された異常を一か所に集めて表示します。
" + "よくある警告:高欠損率(>30%)・定数列・多重共線性・極端外れ値・型不整合
" + "初心者向けヒント:ここを優先TODOリストとして使い、重大度の高い警告から対処しましょう。
" + ), + }, + "sub_best_fit": { + "tip": "各列を理論分布(正規、ガンマ、ワイブル等)と比較し最適を選択。", + "desc": ( + "最適分布フィッティングは、各数値列を正規・対数正規・指数・ガンマ等と比較します。
" + "選択基準:AIC(低い方がよい)・KS統計量(小さい方がよい)・p>0.05で許容範囲
" + "初心者向けヒント:最適が“norm”(正規)なら標準検定がそのまま使えます。そうでなければ対数変換等を検討。
" + ), + }, + "sub_jarque_bera": { + "tip": "歪度・尖度が正規分布と一致するか検定。", + "desc": ( + "Jarque-Bera検定はデータの形状が正規に合致するかを専門的に検定します。
" + "解釈:p≥0.05=正規性を棄却できない | p<0.05=有意に非正規
" + "初心者向けヒント:非正規は非常に一般的です。データが「悪い」のではなく、ノンパラメトリック手法や変換が必要なだけです。
" + ), + }, + "sub_power_transform": { + "tip": "Box-Cox/Yeo-Johnson変換で歪んだ分布を正規に近づける推奨。", + "desc": ( + "べき変換推奨は、歪んだデータをより釣り鐘型に変換する数学変換を提案します。
" + "2つの手法:Box-Cox(正の値のみ) | Yeo-Johnson(任意の値)
" + "初心者向けヒント:べき変換は線形回帰やニューラルネットワークの必須前処理ステップです。
" + ), + }, + "sub_kde_bandwidth": { + "tip": "Scott/Silvermanルールでカーネル密度推定の最適平滑パラメータを決定。", + "desc": ( + "KDE帯域幅分析は密度曲線の最適な「滑らかさ」を探します。
" + "トレードオフ:帯域幅小=細部を捉えるがノイズに過剰適合 | 帯域幅大=滑らかだが特徴を見逃す可能性
" + "初心者向けヒント:2つのルールの帯域幅が大きく異なる場合、外れ値や多峰性がある可能性があります。
" + ), + }, + "sub_partial_corr": { + "tip": "他の全変数をコントロールした後の2変数間の直接関係を測定。", + "desc": ( + "偏相関は、2変数の関係は直接的か、第三の変数によるものか?に答えます。
" + "例:アイスクリーム売上と溺水事故は相関しますが、気温をコントロールすると偏相関はゼロに。気温が真の駆動因子です。
" + "解釈:高偏相関=直接関係 | ≈0=疑似相関(他の変数を介した関係)
" + ), + }, + "sub_mutual_info": { + "tip": "線形・非線形双方の依存関係を捕捉する情報理論的尺度。", + "desc": ( + "相互情報量(MI)は、一方の変数を知ることで他方についてどれだけ情報が得られるかを測定します。
" + "特徴:MI=0=統計的独立 | MI>0=依存あり。Pearsonがゼロの非線形関係も含め、あらゆる依存を捕捉。
" + "初心者向けヒント:MIが高くPearsonが低ければ非線形関係が存在します。散布図で確認しましょう。
" + ), + }, + "sub_bootstrap_ci": { + "tip": "各相関係数のリサンプリング95%信頼区間。", + "desc": ( + "ブートストラップ信頼区間は、各相関推定値の信頼性を示します。
" + "手法:復元抽出1000回→各回の相関を計算→2.5〜97.5パーセンタイル=95%CI
" + "解釈:CIが狭い=安定 | CIが広い=不確実性が高い | CIがゼロを跨ぐ=有意でない可能性
" + ), + }, + "sub_distance_corr": { + "tip": "Szekely距離相関——Pearsonが見逃す非線形依存を検出。", + "desc": ( + "距離相関は、変数が真に独立の場合にのみゼロになる唯一の相関尺度です。
" + "比較:Pearsonが低く距離相関が高い→非線形関係の存在!散布図でパターンを確認。
" + ), + }, + "sub_kmeans": { + "tip": "シルエット分析で最適クラスタ数を自動選択するK-Means。", + "desc": ( + "K-Meansクラスタリングはデータを自動的にk個のグループに分割します。
" + "手順:標準化→k=2〜10を順に試行→シルエットスコア最大のkを選択
" + "指標:シルエット>0.5=良好 | >0.7=強い構造 | 慣性(WCSS):低いほどコンパクト
" + "初心者向けヒント:K-Meansは大まかに球形・似た大きさのクラスタを仮定します。不規則な形状にはDBSCANが適しています。
" + ), + }, + "sub_dbscan": { + "tip": "任意形状のクラスタを自動発見しノイズ点を識別する密度ベース法。", + "desc": ( + "DBSCANはデータの密集領域を探してクラスタを形成します。
" + "利点:k指定不要・任意形状を発見可能・ノイズ点を自動識別
" + "初心者向けヒント:クラスタが1つでノイズが多い場合、明確な密度構造がないかeps調整が必要です。
" + ), + }, + "sub_hierarchical": { + "tip": "データ統合の階層構造を示すデンドログラム。", + "desc": ( + "階層的クラスタリングはデータの段階的マージ過程を樹形図(デンドログラム)で表示します。
" + "読み方:y軸=マージ「距離」。任意の高さで水平線を引くと異なるkが得られます。長い縦線=自然なクラスタ境界。
" + ), + }, + "sub_cluster_profiles": { + "tip": "各K-Meansクラスタの全特徴量にわたる統計プロファイル。", + "desc": ( + "クラスタプロファイルは各クラスタの平均/標準偏差を表示し、各クラスタの特徴を明らかにします。
" + "用途:クラスタ間で平均差が最も大きい特徴=クラスタを定義する重要な区別特徴。
" + "初心者向けヒント:この表を使ってクラスタに名前を付けましょう(例:「高価値顧客」「節約志向の購買者」)。
" + ), + }, + "sub_tsne": { + "tip": "局所近傍構造を保持する非線形2D可視化。", + "desc": ( + "t-SNEは高次元データを2D散布図に圧縮し、どの点が似ているかを保持します。
" + "読み方:2Dで近い=元の空間で類似。はっきりした塊=本当のクラスタの可能性。
" + "⚠️ 注意:クラスタ間距離は無意味・サイズは実際を反映しない・毎回結果が変化(確率的アルゴリズム)
" + ), + }, + "sub_umap": { + "tip": "局所+大域構造を同時保持する高速非線形2D可視化。", + "desc": ( + "UMAPはt-SNEの現代的代替で、通常より高速で大域レイアウトもよく保持します。
" + "利点:高速・大域構造保持が良好・クラスタ間相対位置にある程度の意味。
" + "初心者向けヒント:t-SNEとUMAPの両方で似たクラスタ構造が出れば、それは本当のクラスタである可能性が高いです。
" + ), + }, + "sub_factor_analysis": { + "tip": "観測変数間の相関を説明する隠れた(潜在)因子を発見。", + "desc": ( + "因子分析は、なぜ特定の変数が相関し合うのかを説明します――背後に潜在因子があると仮定します。
" + "例え:10科目の成績の背後に「言語能力」「数学能力」「芸術能力」という3つの潜在因子が隠れている可能性。
" + "初心者向けヒント:ノイズ分散が高い変数は共通因子で説明されず、何か独自のものを測定しています。
" + ), + }, + "sub_factor_loadings": { + "tip": "各観測変数と各潜在因子間の関連強度。", + "desc": ( + "因子負荷量は元の変数と潜在因子の間の結びつきの強さを定量化します。
" + "解釈:|負荷量|>0.7=強 | 0.4-0.7=中程度 | <0.4=弱
" + "交差負荷:複数因子に高い負荷を持つ変数——因子モデルに適さない混合変数。
" + ), + }, + "sub_feature_contrib": { + "tip": "PCA負荷量加重で各特徴の分散寄与をランキング。", + "desc": ( + "PCA加重特徴寄与は総分散への寄与で元の特徴をランク付けし、教師なし特徴選択に使います。
" + "初心者向けヒント:ランキング下位の特徴は全体分散への貢献が極めて小さく、除外候補です。
" + ), + }, + "sub_interaction": { + "tip": "特徴ペア間の相乗的な積交互作用効果を検出。", + "desc": ( + "交互作用検出は、2つの特徴の積が個別にはない情報を含むかを検定します。
" + "初心者向けヒント:強い交互作用を発見したら、その積を新たな特徴として追加するとモデル性能が大幅に向上する場合があります。
" + ), + }, + "sub_monotonic": { + "tip": "Pearson vs Spearman比較で非線形単調パターンを識別。", + "desc": ( + "単調関係分析は、直線ではないが一緒に増減する変数ペアを検出します。
" + "ポイント:|Spearman|-|Pearson|差が大きい=指数・対数等の非線形単調パターン。単調変換で線形モデルを改善できます。
" + ), + }, + "sub_binning": { + "tip": "エントロピー分析で等幅・等頻度ビニング戦略を評価。", + "desc": ( + "ビニング分析は連続変数を離散カテゴリに変換する戦略を評価します。
" + "2つの戦略:等幅(同じ区間幅、外れ値に敏感) vs 等頻度(同じデータ量、歪んだデータに適切)
" + ), + }, + "sub_cardinality": { + "tip": "ユニーク値数を分析しカテゴリエンコーディング手法を推奨。", + "desc": ( + "カーディナリティとエンコーディング推奨:ユニーク値数に基づき最適戦略を提案:
" + "リークリスクはターゲット変数に直接/間接的にアクセスできる特徴を検査します。
" + "初心者向けヒント:モデルの精度が「出来すぎ」(例:99%)の場合、データリークが最有力犯です。
" + ), + }, + "sub_iso_forest": { + "tip": "ランダム分割で異常値を隔離するツリーベースの手法。", + "desc": ( + "Isolation Forestは「異常は隔離しやすい」という考えに基づきます。
" + "原理:ランダム分割木を構築→各点の平均隔離パス長を測定→パスが短い=より異常
" + "初心者向けヒント:パラメータが少なく高次元データに強い、異常検出の第一選択アルゴリズムです。
" + ), + }, + "sub_lof": { + "tip": "各点の局所密度を近傍と比較する密度ベースの異常検出。", + "desc": ( + "LOF(局所外れ値因子)は各点の局所密度をk近傍の密度と比較します。
" + "LOF≈1=正常 | LOF>>1=近傍よりはるかに疎(異常)
" + "初心者向けヒント:各クラスタの密度が異なる場合、LOFはIsolation Forestより有効です。
" + ), + }, + "sub_mahalanobis": { + "tip": "特徴間の相関を考慮したデータ中心からの多変量距離。", + "desc": ( + "マハラノビス距離は共分散構造を考慮して各観測点からデータ中心までの距離を測定します。
" + "vs ユークリッド:ユークリッドは全方向を等しく扱いますが、マハラノビスは異常の程度を考慮します。" + "通常は連動する2つの特徴が一方高・他方低の場合、それが真の異常です。
" + ), + }, + "sub_consensus": { + "tip": "3手法中≥2が同意した場合にのみ異常値としてフラグ。", + "desc": ( + "合意型異常検出はIsolation Forest・LOF・マハラノビスを統合します。
" + "ルール:≥2/3の手法が同意→異常としてフラグ。誤検出を大幅に削減します。
" + "初心者向けヒント:まず合意フラグから調査しましょう——最も信頼性の高い異常候補です。
" + ), + }, + "test_levene": { + "tip": "グループ間の分散が等しいか検定(等分散の仮定)。", + "desc": ( + "Levene検定は異なるグループの分散がほぼ等しいかを確認します。
" + "解釈:p>0.05=等分散成立 | p≤0.05=分散が有意に異なる→Welch t検定やノンパラ法を推奨
" + ), + }, + "test_kruskal_wallis": { + "tip": "ノンパラ版ANOVA:複数グループが同じ分布からのものか検定。", + "desc": ( + "Kruskal-Wallis検定は一元配置ANOVAのノンパラメトリック版で、正規性仮定が不要です。
" + "解釈:p<0.05=少なくとも1グループが有意に異なる→ペアワイズMann-Whitney | p≥0.05=有意差なし
" + ), + }, + "test_mann_whitney": { + "tip": "ノンパラ二標本検定:2独立グループの分布を比較。", + "desc": ( + "Mann-Whitney U検定は2つのグループが同じ分布から来ているか判定します。
" + "解釈:p<0.05=2群は有意に異なる | p≥0.05=有意差なし
" + "初心者向けヒント:平均が同じでも散らばりが異なれば有意になることがあります。
" + ), + }, + "test_chi_square": { + "tip": "観測カテゴリ頻度が期待頻度から逸脱しているか検定。", + "desc": ( + "カイ二乗適合度検定は観測分布が期待(デフォルトで均一)に合致するかを検定します。
" + "解釈:p<0.05=有意に逸脱 | p≥0.05=一致
" + "条件:各期待頻度が5以上であること。
" + ), + }, + "test_grubbs": { + "tip": "最極端値が統計的に有意な外れ値かを検定。", + "desc": ( + "Grubbs検定は最極端値が分布の自然な極端か有意な異常かを評価します。
" + "解釈:p<0.05=有意な外れ値 | p≥0.05=予想範囲内
" + "初心者向けヒント:Grubbs検定は一度に1つの極端値のみ検定します。複数の外れ値にはIQRやIsolation Forestを使いましょう。
" + ), + }, + "test_adf": { + "tip": "時系列が定常か(統計的性質が時間変化しないか)検定。", + "desc": ( + "ADF検定は時系列に単位根(非定常)があるか判定します。
" + "解釈:p<0.05=定常✓ | p≥0.05=非定常、差分やトレンド除去を検討
" + "初心者向けヒント:時系列的な列は回帰前にADFチェックが必須です。非定常な予測変数は回帰結果を無意味にします。
" + ), + }, + "sub_column_quality": { + "tip": "各列の完全性・一意性・妥当性の品質スコア。", + "desc": ( + "列品質は全体スコアを各列に分解し、問題列をピンポイントで特定します。
" + "初心者向けヒント:品質スコアが極端に低い列がクリーニングまたは削除の最優先ターゲットです。
" + ), + }, + "sub_cleaning_log": { + "tip": "全自動クリーニング操作のステップバイステップ記録。", + "desc": "クリーニングログはシステムが実行した各変換を記録し、完全な透明性と再現性を保証します。
", + }, + "sub_detected_issues": { + "tip": "前処理で検出されたデータ品質問題のリスト。", + "desc": "検出された問題は混合型、疑わしいパターン(「999」欠損マーカーなど)、エンコーディングエラー等を列挙します。
", + }, + "sub_normality_tests": { + "tip": "各列のShapiro-Wilk、Anderson-Darling、Jarque-Bera正規性検定。", + "desc": ( + "正規性検定は3つの相補的な検定で各列が正規分布に従うか評価します。
" + "3つ全てp<0.05なら非正規の可能性が高いです。不一致ならヒストグラムで確認しましょう。
" + ), + }, + "sub_vif": { + "tip": "分散膨張因子(VIF)で多重共線性を検出。", + "desc": ( + "VIFは特徴間の相関により回帰係数の分散がどれだけ膨張するかを測定します。
" + "解釈:VIF=1=相関なし | 1-5=低 | 5-10=中程度 | >10=深刻、除去や結合を検討
" + "初心者向けヒント:VIFが高いと係数が不安定になり、データの微小変化で符号が反転することがあります。
" + ), + }, + "sub_summary": { + "tip": "分布形状、正規性、外れ値数のクイックサマリー。", + "desc": "要約は歪度分類、尖度タイプ、正規性指標、外れ値カウントを1つの表にまとめます。
", + }, + "sub_variance_explained": { + "tip": "各主成分が捕捉する分散の割合(スクリープロットデータ)。", + "desc": ( + "説明分散は各主成分の個別および累積分散寄与率を表示します。
" + "スクリープロット:「肘」(曲線が急に曲がる点)が追加成分の価値が低下する転換点です。
" + ), + }, + "sub_loadings": { + "tip": "各元の特徴の各主成分への寄与の重み。", + "desc": ( + "PCA負荷量は各元の特徴が各主成分にどの程度の重みを持つかを表示します。
" + "例:PC1が「身長」「体重」「BMI」に高い負荷→「体格」成分と解釈可能。
" + ), + }, +} + +# -- German ------------------------------------------------------------ +METHOD_INFO["de"] = { + "section_overview": { + "tip": "Datensatz-Überblick: Zeilen/Spalten, Datentypen, Speicherverbrauch.", + "desc": ( + "Überblick zeigt die Gesamtstruktur des Datensatzes vor der Detailanalyse.
" + "Enthaltene Informationen:
" + "Warum wichtig: Eine Überprüfung der Zeilen und Typen deckt Ladefehler (abgeschnittene Dateien, Trennzeichenfehler, Kodierungsprobleme) frühzeitig auf.
" + "Anfänger-Tipp: Weniger Zeilen als erwartet? Möglicherweise ein Trennzeichenproblem. Numerische Spalten als «Text» angezeigt? Es könnten nicht-numerische Zeichen enthalten sein.
" + ), + }, + "section_quality": { + "tip": "Datenqualität in 4 Dimensionen: Vollständigkeit, Eindeutigkeit, Konsistenz, Gültigkeit (0-100%).", + "desc": ( + "Qualitätsbewertung – ein «Gesundheitscheck» über 4 unabhängige Achsen, je 0–100%.
" + "Die vier Achsen:
" + "Bewertung: 90-100% = ausgezeichnet | 70-89% = akzeptabel, Hinweise beachten | <70% = vor Modellierung beheben
" + "Formel: 0,35×Vollst. + 0,25×Eindeut. + 0,20×Konsist. + 0,20×Gültigkeit
" + ), + }, + "section_preprocessing": { + "tip": "Protokoll aller automatischen Bereinigungsschritte vor der Analyse.", + "desc": ( + "Vorverarbeitungsprotokoll dokumentiert chronologisch alle automatischen Bereinigungsoperationen.
" + "Typische Schritte: Entfernung leerer/konstanter Spalten, String-zu-Zahl-Konvertierung, Kodierungskorrekturen, nicht parsbare Zeilen entfernen.
" + "Warum wichtig: Reproduzierbarkeit ist die Grundlage vertrauenswürdiger Analysen. Nur wer weiß, welche Transformationen angewandt wurden, kann Ergebnisse validieren.
" + "Anfänger-Tipp: Wurde eine wichtige Spalte entfernt? Dann hat die Originaldatei möglicherweise Formatprobleme.
" + ), + }, + "section_descriptive": { + "tip": "Zentrale Tendenz, Streuung und Verteilungsform jeder Spalte.", + "desc": ( + "Deskriptive Statistik ist das Fundament der explorativen Datenanalyse (EDA) und fasst Zentrum, Streuung und Form jeder Spalte zusammen.
" + "Numerische Kennzahlen:
" + "Anfänger-Tipp: Spalten mit großer Abweichung zwischen Mittelwert und Median könnten Ausreißer oder starke Schiefe aufweisen.
" + ), + }, + "section_distribution": { + "tip": "Histogramme und Q-Q-Plots zur Visualisierung der Verteilungsform.", + "desc": ( + "Verteilungsanalyse macht die Werteverteilung jeder Spalte sichtbar.
" + "Diagrammtypen:
" + "Häufige Formen: Glocke (normal), rechtschief (Einkommen), linksschief (Prüfungsnoten nahe Maximum), bimodal (Mischung), gleichverteilt.
" + "Warum wichtig: Viele ML-Algorithmen setzen normalverteilte Eingaben voraus. Kenntnis der tatsächlichen Verteilung hilft bei Modellwahl und Transformationen.
" + ), + }, + "section_correlation": { + "tip": "Pearson- (linear) und Spearman- (Rang-)Korrelation zwischen numerischen Spalten.", + "desc": ( + "Korrelationsanalyse misst die Stärke der Beziehung zwischen Variablenpaaren.
" + "Zwei Korrelationsmaße:
" + "Heatmap: Dunklere Farbe = stärkere Korrelation. Rot = positiv, Blau = negativ.
" + "Schwellenwerte: |r|>0,90 schwere Multikollinearität | |r|>0,70 stark | |r|<0,30 schwach
" + "Anfänger-Tipp: Hoch korrelierte Features tragen ähnliche Informationen. Beide in einem linearen Modell → Instabilität.
" + ), + }, + "section_missing": { + "tip": "Analyse fehlender Daten: Muster, Anteil und Mechanismus.", + "desc": ( + "Analyse fehlender Daten untersucht, wo, wie viel und warum Daten fehlen.
" + "Drei Mechanismen:
" + "Praxis-Leitfaden: <5% Löschen/Mittelwert | 5-30% KNN/MICE | >50% Spalte entfernen erwägen
" + ), + }, + "section_outlier": { + "tip": "Erkennung anomaler Datenpunkte mittels IQR-Zäunen und Z-Scores.", + "desc": ( + "Ausreißererkennung identifiziert Werte, die weit vom Rest entfernt liegen.
" + "IQR-Methode: IQR = Q3−Q1 | Mild:
Wichtig: Nicht alle Ausreißer sind Fehler! Bei Betrugserkennung oder seltenen Krankheiten können Ausreißer die wertvollsten Daten sein.
" + "Anfänger-Tipp: Punkte jenseits der Whisker im Boxplot sind potenzielle Ausreißer, die untersucht werden sollten.
" + ), + }, + "section_categorical": { + "tip": "Häufigkeitsverteilung, Balkendiagramme und Entropie kategorischer Spalten.", + "desc": ( + "Kategorische Analyse untersucht nicht-numerische Spalten – Textlabels, Kategorien, boolesche Werte.
" + "Kernkennzahlen:
" + "Anfänger-Tipp: Ein extrem hoher Balken im Diagramm zeigt «Ungleichgewicht» – ggf. Oversampling nötig.
" + ), + }, + "section_importance": { + "tip": "Feature-Ranking nach Varianz und gegenseitiger Information.", + "desc": ( + "Feature-Wichtigkeit beantwortet: Welche Spalten tragen die nützlichsten Informationen?
" + "Methoden: Varianz (≈0 = konstant, keine Info) · mittlere Korrelation · Mutual Information (linear + nichtlinear).
" + "Anfänger-Tipp: Löschen Sie unwichtige Features nicht blind – einzeln schwach, aber in Kombinationen möglicherweise stark (Interaktionseffekte).
" + ), + }, + "section_pca": { + "tip": "Hauptkomponentenanalyse: intrinsische Dimensionen und Varianzstruktur.", + "desc": ( + "PCA transformiert korrelierte Features in nach Varianz geordnete, unkorrelierte Komponenten.
" + "Ergebnisse: Scree-Plot · kumulative Varianz · Loadings-Matrix
" + "Anfänger-Tipp: PCA funktioniert am besten bei ähnlichen Skalen. Das System hat automatisch z-standardisiert.
" + ), + }, + "section_duplicates": { + "tip": "Erkennung vollständig identischer Zeilen, die Statistiken aufblähen.", + "desc": ( + "Duplikat-Analyse scannt nach Zeilen, die in allen Spalten identisch sind.
" + "Risiken: Aufgeblähte Stichprobe → zu enge Konfidenzintervalle · Train/Test-Leakage · ≈100% Duplikate = wahrscheinlich ein Ladefehler.
" + "Anfänger-Tipp: Wenige Duplikate (<1%) sind meist harmlos, aber unerwartet hohe Raten erfordern Untersuchung.
" + ), + }, + "section_warnings": { + "tip": "Zusammenfassung aller erkannten Warnungen und potenzieller Probleme.", + "desc": ( + "Warnungen und Probleme bündelt alle Anomalien aus allen Analysen an einem Ort.
" + "Häufige Warnungen: Hoher Fehlanteil (>30%) · Konstante Spalten · Multikollinearität · Extreme Ausreißer · Typ-Inkompatibilitäten
" + "Anfänger-Tipp: Behandeln Sie dies als priorisierte Aufgabenliste – beginnen Sie mit den schwerwiegendsten.
" + ), + }, + "sub_best_fit": { + "tip": "Vergleich jeder Spalte mit theoretischen Verteilungen (Normal, Gamma, Weibull, …).", + "desc": ( + "Best-Fit-Verteilung vergleicht numerische Spalten mit Normal-, Log-Normal-, Exponential-, Gamma-Verteilungen usw.
" + "Kriterien: AIC (niedriger = besser) · KS-Statistik (kleiner = besser) · p>0,05 = akzeptabel
" + "Anfänger-Tipp: Best-Fit «norm» → Standard-Tests direkt anwendbar; sonst Log-Transformation erwägen.
" + ), + }, + "sub_jarque_bera": { + "tip": "Prüft, ob Schiefe und Kurtosis einer Normalverteilung entsprechen.", + "desc": ( + "Jarque-Bera-Test prüft speziell die Form der Verteilung.
" + "Interpretation: p≥0,05 = Normalität nicht abgelehnt | p<0,05 = signifikant nicht-normal
" + "Anfänger-Tipp: Nicht-Normalität ist sehr häufig und bedeutet nicht «schlechte» Daten, sondern dass nicht-parametrische Methoden oder Transformationen nötig sind.
" + ), + }, + "sub_power_transform": { + "tip": "Empfehlung für Box-Cox/Yeo-Johnson zur Annäherung an Normalverteilung.", + "desc": ( + "Potenztransformation schlägt mathematische Umformungen vor, die schiefe Daten glockenförmiger machen.
" + "Zwei Verfahren: Box-Cox (nur positive Werte) | Yeo-Johnson (beliebige Werte)
" + "Anfänger-Tipp: Eine wichtige Vorverarbeitungs-Stufe für lineare Regression und neuronale Netze.
" + ), + }, + "sub_kde_bandwidth": { + "tip": "Optimale Glättung der Kerndichteschätzung nach Scott/Silverman.", + "desc": ( + "KDE-Bandbreite bestimmt die optimale «Glätte» der Dichtekurve.
" + "Abwägung: Klein = Detailreich, aber Rauschen-Überanpassung | Groß = Glatt, aber Merkmale werden möglicherweise übersehen
" + "Anfänger-Tipp: Große Differenz beider Regeln → möglicherweise Ausreißer oder Multimodalität.
" + ), + }, + "sub_partial_corr": { + "tip": "Direkte Beziehung zweier Variablen nach Kontrolle aller anderen.", + "desc": ( + "Partielle Korrelation beantwortet: Ist die Beziehung direkt oder durch eine dritte Variable verursacht?
" + "Beispiel: Eisverkauf ↔ Ertrinkungsfälle korrelieren, aber nach Kontrolle der Temperatur ≈0. Temperatur ist der wahre Treiber.
" + "Interpretation: Hohe partielle Korr. = direkt | ≈0 = Scheinkorrelation (durch andere Variablen vermittelt)
" + ), + }, + "sub_mutual_info": { + "tip": "Informationstheoretische Maßzahl für lineare und nichtlineare Abhängigkeit.", + "desc": ( + "Gegenseitige Information (MI) misst, wie viel man über eine Variable erfährt, wenn man eine andere kennt.
" + "Kern: MI=0 = statistisch unabhängig | MI>0 = Abhängigkeit vorhanden. Erfasst auch nichtlineare Beziehungen, bei denen Pearson null ist.
" + "Anfänger-Tipp: Hohe MI bei niedrigem Pearson → nichtlineare Beziehung. Streudiagramm prüfen!
" + ), + }, + "sub_bootstrap_ci": { + "tip": "Resampling-basierte 95%-Konfidenzintervalle für jede Korrelation.", + "desc": ( + "Bootstrap-Konfidenzintervall zeigt, wie zuverlässig jede Korrelationsschätzung ist.
" + "Methode: 1000× mit Zurücklegen ziehen → Korrelation berechnen → 2,5–97,5 Perzentil = 95% KI
" + "Interpretation: Eng = stabil | Breit = hohe Unsicherheit | KI überquert Null = möglicherweise nicht signifikant
" + ), + }, + "sub_distance_corr": { + "tip": "Szekely-Distanzkorrelation – erkennt nichtlineare Abhängigkeiten.", + "desc": ( + "Distanzkorrelation ist genau dann null, wenn Variablen wirklich unabhängig sind – stärkere Garantie als Pearson.
" + "Vergleich: Niedriger Pearson + hohe Distanzkorrelation → nichtlineare Beziehung! Streudiagramm anschauen.
" + ), + }, + "sub_kmeans": { + "tip": "K-Means mit automatischer Silhouetten-Optimierung der Clusteranzahl.", + "desc": ( + "K-Means-Clustering teilt Daten automatisch in k Gruppen.
" + "Ablauf: Standardisierung → k=2–10 durchprobieren → k mit höchstem Silhouetten-Score wählen
" + "Metriken: Silhouette >0,5 = gut | >0,7 = starke Struktur | Trägheit (WCSS): niedriger = kompakter
" + "Anfänger-Tipp: K-Means setzt kugelförmige, ähnlich große Cluster voraus. Für unregelmäßige Formen: DBSCAN.
" + ), + }, + "sub_dbscan": { + "tip": "Dichtebasiertes Clustering: findet beliebige Clusterformen automatisch.", + "desc": ( + "DBSCAN bildet Cluster durch Auffinden dichter Datenbereiche.
" + "Vorteile: Kein k nötig · Beliebige Formen · Automatische Rauscherkennung
" + "Anfänger-Tipp: Nur 1 Cluster + viel Rauschen? Keine klare Dichtestruktur oder eps-Anpassung nötig.
" + ), + }, + "sub_hierarchical": { + "tip": "Dendrogramm: zeigt schrittweise Zusammenführung der Cluster.", + "desc": ( + "Hierarchisches Clustering zeigt den schrittweisen Zusammenführungsprozess als Baumdiagramm (Dendrogramm).
" + "Lesen: y-Achse = Fusionsdistanz. Horizontale Linie auf beliebiger Höhe → verschiedene k. Lange vertikale Linien = natürliche Clustergrenzen.
" + ), + }, + "sub_cluster_profiles": { + "tip": "Statistische Zusammenfassung jedes K-Means-Clusters über alle Features.", + "desc": ( + "Cluster-Profile zeigen pro Cluster Mittelwert/Std.-Abweichung und machen die Besonderheiten sichtbar.
" + "Verwendung: Features mit den größten Mittelwertunterschieden = die definierenden Merkmale.
" + "Anfänger-Tipp: Nutzen Sie diese Tabelle, um Clustern Namen zu geben (z. B. «Premium-Kunden», «Sparfüchse»).
" + ), + }, + "sub_tsne": { + "tip": "Nichtlineare 2D-Projektion mit Erhaltung lokaler Nachbarschaften.", + "desc": ( + "t-SNE komprimiert hochdimensionale Daten auf ein 2D-Streudiagramm und bewahrt Ähnlichkeiten.
" + "Lesen: Nah in 2D = ähnlich im Originalraum. Klar abgegrenzte Gruppen = mögliche echte Cluster.
" + "⚠️ Achtung: Inter-Cluster-Abstände sind bedeutungslos · Größen spiegeln nicht die Realität · Ergebnisse variieren (stochastisch)
" + ), + }, + "sub_umap": { + "tip": "Schnelle nichtlineare 2D-Projektion mit Erhalt lokaler + globaler Struktur.", + "desc": ( + "UMAP ist die moderne Alternative zu t-SNE – meist schneller und besser im Erhalt globaler Layouts.
" + "Vorteile: Schnell · Besserer Erhalt globaler Struktur · Relative Positionen der Cluster teilweise aussagekräftig
" + "Anfänger-Tipp: Zeigen t-SNE und UMAP ähnliche Clusterstrukturen, sind diese wahrscheinlich real.
" + ), + }, + "sub_factor_analysis": { + "tip": "Entdeckung verborgener (latenter) Faktoren hinter beobachteten Korrelationen.", + "desc": ( + "Faktorenanalyse erklärt, warum bestimmte Variablen korrelieren – sie nimmt latente Faktoren an.
" + "Analogie: 10 Prüfungsnoten könnten 3 latente Faktoren widerspiegeln: «Sprachfähigkeit», «Mathefähigkeit», «Kreativität».
" + "Anfänger-Tipp: Variablen mit hoher Rauschvarianz werden nicht durch gemeinsame Faktoren erklärt und messen möglicherweise etwas Einzigartiges.
" + ), + }, + "sub_factor_loadings": { + "tip": "Stärke der Zuordnung jeder Variable zu jedem latenten Faktor.", + "desc": ( + "Faktorladungen quantifizieren die Beziehung zwischen Originalvariablen und latenten Faktoren.
" + "Interpretation: |Ladung|>0,7 = stark | 0,4-0,7 = mäßig | <0,4 = schwach
" + "Querladungen: Variable lädt hoch auf mehrere Faktoren – passt nicht gut ins Faktormodell.
" + ), + }, + "sub_feature_contrib": { + "tip": "PCA-gewichteter Varianzbeitrag jedes Features zum Gesamtdatensatz.", + "desc": ( + "PCA-gewichteter Feature-Beitrag rankt Originalfeatures nach ihrem Anteil an der Gesamtvarianz für unüberwachte Feature-Auswahl.
" + "Anfänger-Tipp: Features am unteren Ende tragen minimal zur Gesamtvarianz bei – Kandidaten für Entfernung.
" + ), + }, + "sub_interaction": { + "tip": "Erkennung synergistischer Produkt-Interaktionen zwischen Feature-Paaren.", + "desc": ( + "Interaktionserkennung prüft, ob das Produkt zweier Features Informationen enthält, die einzeln fehlen.
" + "Anfänger-Tipp: Starke Interaktion gefunden? Produkt als neues Feature hinzufügen kann Modellleistung deutlich steigern.
" + ), + }, + "sub_monotonic": { + "tip": "Pearson vs. Spearman zum Aufdecken nichtlinearer monotoner Muster.", + "desc": ( + "Monotone Beziehungsanalyse erkennt Variablenpaare, die zusammen steigen/fallen, aber nicht linear.
" + "Schlüssel: Große |Spearman|−|Pearson|-Differenz = exponentielle/logarithmische Muster. Monotone Transformation kann lineare Modelle verbessern.
" + ), + }, + "sub_binning": { + "tip": "Bewertung von Gleich-Breite- und Gleich-Frequenz-Binning per Entropie.", + "desc": ( + "Binning-Analyse evaluiert Strategien zur Diskretisierung kontinuierlicher Variablen.
" + "Zwei Strategien: Gleiche Breite (ausreißerempfindlich) vs. Gleiche Frequenz (geeignet für schiefe Daten)
" + ), + }, + "sub_cardinality": { + "tip": "Analyse der Unique-Wert-Anzahl und empfohlene Kodierungsmethode.", + "desc": ( + "Kardinalität und Kodierung: Empfehlung basierend auf Unique-Wert-Anzahl:
" + "Leakage-Risiko prüft Features, die direkt/indirekt auf die Zielvariable zugreifen.
" + "Anfänger-Tipp: «Zu gute» Genauigkeit (z. B. 99%) → Data Leakage ist der Hauptverdächtige.
" + ), + }, + "sub_iso_forest": { + "tip": "Baumbasierte Anomalieerkennung durch zufällige Isolation.", + "desc": ( + "Isolation Forest basiert auf der Idee: «Anomalien sind leichter zu isolieren.»
" + "Prinzip: Zufällige Teilungsbäume → mittlere Isolation-Pfadlänge → kurzer Pfad = anomaler
" + "Anfänger-Tipp: Wenige Parameter, gut bei hochdimensionalen Daten – erste Wahl für Anomalieerkennung.
" + ), + }, + "sub_lof": { + "tip": "Dichtebasierte Erkennung: Vergleich lokaler Dichte mit Nachbarn.", + "desc": ( + "LOF vergleicht die lokale Dichte eines Punktes mit der seiner k nächsten Nachbarn.
" + "LOF≈1 = normal | LOF>>1 = deutlich dünner als Nachbarn (anomal)
" + "Anfänger-Tipp: Bei Clustern unterschiedlicher Dichte ist LOF effektiver als Isolation Forest.
" + ), + }, + "sub_mahalanobis": { + "tip": "Multivariater Abstand zum Datenzentrum unter Berücksichtigung von Korrelationen.", + "desc": ( + "Mahalanobis-Distanz misst den Abstand jeder Beobachtung zum Datenzentrum unter Berücksichtigung der Kovarianzstruktur.
" + "vs. Euklid: Euklid behandelt alle Richtungen gleich; Mahalanobis berücksichtigt Korrelationen – " + "wenn zwei normalerweise gleichzeitig steigende Features gegeneinander laufen, ist das wirklich anomal.
" + ), + }, + "sub_consensus": { + "tip": "Nur als anomal markiert, wenn ≥2 von 3 Methoden zustimmen.", + "desc": ( + "Konsens-Anomalieerkennung kombiniert Isolation Forest, LOF und Mahalanobis.
" + "Regel: ≥2/3 Methoden stimmen zu → als anomal markiert. Reduziert Fehlalarme erheblich.
" + "Anfänger-Tipp: Beginnen Sie die Untersuchung mit Konsens-Flags – das sind die zuverlässigsten Anomalie-Kandidaten.
" + ), + }, + "test_levene": { + "tip": "Prüft, ob die Varianzen verschiedener Gruppen gleich sind.", + "desc": ( + "Levene-Test bestätigt, ob Gruppenvarianzen annähernd gleich sind.
" + "Interpretation: p>0,05 = Varianzhomogenität bestätigt | p≤0,05 = signifikant unterschiedlich → Welch-Test oder nicht-parametrisch
" + ), + }, + "test_kruskal_wallis": { + "tip": "Nichtparametrische ANOVA: Prüft, ob Gruppen aus derselben Verteilung stammen.", + "desc": ( + "Kruskal-Wallis-Test ist das nicht-parametrische Pendant zur Einweg-ANOVA. Keine Normalitätsannahme nötig.
" + "Interpretation: p<0,05 = mindestens eine Gruppe unterscheidet sich → paarweise Mann-Whitney | p≥0,05 = kein Unterschied
" + ), + }, + "test_mann_whitney": { + "tip": "Nichtparametrischer Zweistichprobentest: Vergleich zweier Gruppen.", + "desc": ( + "Mann-Whitney U-Test prüft, ob zwei Gruppen aus derselben Verteilung stammen.
" + "Interpretation: p<0,05 = signifikanter Unterschied | p≥0,05 = kein Unterschied
" + "Anfänger-Tipp: Gleicher Mittelwert, aber unterschiedliche Streuung kann ebenfalls signifikant sein.
" + ), + }, + "test_chi_square": { + "tip": "Prüft, ob beobachtete Kategorie-Häufigkeiten von Erwartungen abweichen.", + "desc": ( + "Chi-Quadrat-Anpassungstest prüft, ob die beobachtete Verteilung der erwarteten (Standard: gleichmäßig) entspricht.
" + "Interpretation: p<0,05 = signifikante Abweichung | p≥0,05 = Übereinstimmung
" + "Voraussetzung: Jede erwartete Häufigkeit muss ≥5 sein.
" + ), + }, + "test_grubbs": { + "tip": "Prüft, ob der extremste Wert ein statistisch signifikanter Ausreißer ist.", + "desc": ( + "Grubbs-Test bewertet, ob der extremste Wert ein natürliches Extrem oder eine signifikante Anomalie ist.
" + "Interpretation: p<0,05 = signifikanter Ausreißer | p≥0,05 = im erwarteten Bereich
" + "Anfänger-Tipp: Grubbs testet nur einen Extremwert. Für mehrere Ausreißer: IQR oder Isolation Forest.
" + ), + }, + "test_adf": { + "tip": "Prüft, ob eine Zeitreihe stationär ist (zeitinvariante Statistiken).", + "desc": ( + "ADF-Test prüft auf Einheitswurzel (Nicht-Stationarität) in Zeitreihen.
" + "Interpretation: p<0,05 = stationär ✓ | p≥0,05 = nicht-stationär → Differenzierung erwägen
" + "Anfänger-Tipp: Zeitreihen-Spalten müssen vor der Regression mit ADF geprüft werden. Nicht-stationäre Prädiktoren machen Regressionen sinnlos.
" + ), + }, + "sub_column_quality": { + "tip": "Qualitätsscore pro Spalte: Vollständigkeit, Eindeutigkeit, Gültigkeit.", + "desc": ( + "Spaltenqualität zerlegt den Gesamtscore auf jede Spalte, um Problemspalten zu identifizieren.
" + "Anfänger-Tipp: Spalten mit extrem niedrigem Score sind vorrangige Ziele für Bereinigung oder Entfernung.
" + ), + }, + "sub_cleaning_log": { + "tip": "Schritt-für-Schritt-Protokoll aller automatischen Bereinigungen.", + "desc": "Bereinigungsprotokoll dokumentiert jede Transformation für volle Transparenz und Reproduzierbarkeit.
", + }, + "sub_detected_issues": { + "tip": "Liste der bei der Vorverarbeitung erkannten Datenqualitätsprobleme.", + "desc": "Erkannte Probleme listet gemischte Typen, verdächtige Muster (z. B. «999» als Fehlwert), Kodierungsfehler etc.
", + }, + "sub_normality_tests": { + "tip": "Shapiro-Wilk, Anderson-Darling und Jarque-Bera pro Spalte.", + "desc": ( + "Normalitätstests bewerten mit drei komplementären Tests, ob jede Spalte normalverteilt ist.
" + "Alle drei p<0,05 → wahrscheinlich nicht normal. Bei Widersprüchen Histogramm prüfen.
" + ), + }, + "sub_vif": { + "tip": "Varianzinflationsfaktor zur Erkennung von Multikollinearität.", + "desc": ( + "VIF misst, wie stark die Varianz eines Regressionskoeffizienten durch Feature-Korrelation aufgebläht wird.
" + "Interpretation: VIF=1 keine Korrelation | 1-5 gering | 5-10 mäßig | >10 ernst – Entfernung/Zusammenlegung erwägen
" + "Anfänger-Tipp: Hoher VIF → instabile Koeffizienten; kleine Datenänderungen können das Vorzeichen umkehren.
" + ), + }, + "sub_summary": { + "tip": "Kurzübersicht: Verteilungsform, Normalität und Ausreißeranzahl.", + "desc": "Zusammenfassung zeigt Schiefekategorie, Kurtosis-Typ, Normalitätsindikator und Ausreißeranzahl in einer Tabelle.
", + }, + "sub_variance_explained": { + "tip": "Von jeder Hauptkomponente erfasster Varianzanteil (Scree-Plot-Daten).", + "desc": ( + "Erklärte Varianz zeigt den individuellen und kumulativen Varianzanteil jeder Komponente.
" + "Scree-Plot: Der «Ellbogen» (Stelle des stärksten Knicks) ist der Punkt, ab dem zusätzliche Komponenten wenig beitragen.
" + ), + }, + "sub_loadings": { + "tip": "Gewicht jedes Originalfeatures in jeder Hauptkomponente.", + "desc": ( + "PCA-Ladungen zeigen, wie stark jedes Originalfeature zu jeder Hauptkomponente beiträgt.
" + "Beispiel: PC1 lädt hoch auf «Größe», «Gewicht», «BMI» → interpretierbar als «Körperbau»-Komponente.
" + ), + }, +} +# -- French ------------------------------------------------------------ +METHOD_INFO["fr"] = { + "section_overview": { + "tip": "Aperçu du jeu de données : lignes/colonnes, types, mémoire.", + "desc": ( + "Vue d'ensemble – structure globale du jeu de données avant l'analyse détaillée.
" + "Informations incluses :
" + "Pourquoi c'est important : Vérifier les lignes et les types révèle les erreurs de chargement (fichier tronqué, mauvais séparateur, problème d'encodage).
" + "Conseil débutant : Moins de lignes que prévu ? Peut-être un problème de séparateur. Colonnes numériques affichées comme « texte » ? Elles contiennent probablement des caractères non numériques.
" + ), + }, + "section_quality": { + "tip": "Qualité en 4 dimensions : complétude, unicité, cohérence, validité (0-100%).", + "desc": ( + "Évaluation de la qualité – un « bilan de santé » sur 4 axes indépendants, chacun de 0 à 100%.
" + "Les quatre axes :
" + "Notation : 90-100% = excellent | 70-89% = acceptable, attention aux alertes | <70% = à corriger avant modélisation
" + "Formule : 0,35×Complétude + 0,25×Unicité + 0,20×Cohérence + 0,20×Validité
" + ), + }, + "section_preprocessing": { + "tip": "Journal de toutes les étapes de nettoyage automatique avant l'analyse.", + "desc": ( + "Journal de prétraitement documente chronologiquement toutes les opérations de nettoyage automatique.
" + "Étapes typiques : Suppression de colonnes vides/constantes, conversion texte→nombre, correction d'encodage, suppression de lignes non analysables.
" + "Pourquoi c'est important : La reproductibilité est le fondement d'analyses fiables. Connaître les transformations permet de valider les résultats.
" + "Conseil débutant : Si une colonne importante a été supprimée, le fichier source a probablement un problème de format.
" + ), + }, + "section_descriptive": { + "tip": "Tendance centrale, dispersion et forme de la distribution pour chaque colonne.", + "desc": ( + "Statistiques descriptives – fondement de l'analyse exploratoire (EDA), résumant le centre, la dispersion et la forme de chaque colonne.
" + "Indicateurs numériques :
" + "Conseil débutant : Les colonnes avec une grande différence moyenne/médiane peuvent contenir des valeurs aberrantes ou une forte asymétrie.
" + ), + }, + "section_distribution": { + "tip": "Histogrammes et Q-Q plots pour visualiser la forme de chaque colonne numérique.", + "desc": ( + "Analyse des distributions – visualisation de la répartition des valeurs.
" + "Types de graphiques :
" + "Formes courantes : Cloche (normale), asymétrie droite (revenus), asymétrie gauche (notes en haut d'échelle), bimodale (mélange), uniforme.
" + "Pourquoi c'est important : De nombreux algorithmes ML supposent des données normales. Connaître la distribution réelle guide le choix du modèle et des transformations.
" + ), + }, + "section_correlation": { + "tip": "Corrélation de Pearson (linéaire) et Spearman (rang) entre colonnes numériques.", + "desc": ( + "Analyse de corrélation mesure la force des liens entre paires de variables.
" + "Deux mesures :
" + "Heatmap : Couleur foncée = corrélation forte. Rouge = positive, bleu = négative.
" + "Seuils : |r|>0,90 multicolinéarité sévère | |r|>0,70 forte | |r|<0,30 faible
" + "Conseil débutant : Les features fortement corrélées portent des informations similaires. Les deux dans un modèle linéaire → instabilité.
" + ), + }, + "section_missing": { + "tip": "Analyse des données manquantes : motif, proportion et mécanisme.", + "desc": ( + "Analyse des données manquantes – où, combien et pourquoi les données sont absentes.
" + "Trois mécanismes :
" + "Guide pratique : <5% suppression/moyenne | 5–30% KNN/MICE | >50% suppression de colonne
" + ), + }, + "section_outlier": { + "tip": "Détection des anomalies via IQR et Z-score.", + "desc": ( + "Détection des valeurs aberrantes – points extrêmement éloignés des autres données.
" + "Méthode IQR : IQR = Q3−Q1 | Modéré :
Important : Toutes les valeurs aberrantes ne sont pas des erreurs ! En détection de fraude ou maladies rares, elles sont parfois les plus précieuses.
" + "Conseil débutant : Dans le boxplot, les points au-delà des moustaches (whiskers) sont des aberrants potentiels à examiner.
" + ), + }, + "section_categorical": { + "tip": "Fréquences, diagrammes en barres et entropie des colonnes catégorielles.", + "desc": ( + "Analyse catégorielle examine les colonnes non numériques – labels texte, catégories, booléens.
" + "Indicateurs clés :
" + "Conseil débutant : Une barre très haute dans le graphique signale un « déséquilibre » – sur-échantillonnage peut être nécessaire.
" + ), + }, + "section_importance": { + "tip": "Classement des features par variance et information mutuelle.", + "desc": ( + "Importance des features : quelles colonnes portent l'information la plus utile ?
" + "Méthodes : Variance (≈0 = constante, sans info) · corrélation moyenne · information mutuelle (linéaire + non linéaire).
" + "Conseil débutant : Ne supprimez pas aveuglément les features peu importantes – faibles seules, elles peuvent être fortes en combinaison (effets d'interaction).
" + ), + }, + "section_pca": { + "tip": "ACP : dimensions intrinsèques et structure de la variance.", + "desc": ( + "ACP transforme les features corrélées en composantes non corrélées, ordonnées par variance.
" + "Résultats : Scree plot · variance cumulée · matrice des loadings
" + "Conseil débutant : L'ACP fonctionne mieux avec des échelles similaires. Le système a automatiquement standardisé (z-score).
" + ), + }, + "section_duplicates": { + "tip": "Détection des lignes en double exact pouvant gonfler les statistiques.", + "desc": ( + "Analyse des doublons – recherche de lignes identiques sur toutes les colonnes.
" + "Risques : Effectif gonflé → intervalles de confiance trop étroits · fuite train/test · ≈100% de doublons = probable erreur de chargement.
" + "Conseil débutant : Quelques doublons (<1%) sont généralement inoffensifs, mais un taux élevé inattendu doit être investigué.
" + ), + }, + "section_warnings": { + "tip": "Synthèse de toutes les alertes et problèmes détectés.", + "desc": ( + "Avertissements et problèmes regroupe toutes les anomalies détectées en un seul endroit.
" + "Avertissements courants : Fort taux de manquants (>30%) · Colonnes constantes · Multicolinéarité · Aberrants extrêmes · Incohérences de type
" + "Conseil débutant : Utilisez cette section comme liste de tâches prioritaire – commencez par les plus sévères.
" + ), + }, + "sub_best_fit": { + "tip": "Comparaison de chaque colonne avec des distributions théoriques (normale, gamma, Weibull…).", + "desc": ( + "Meilleur ajustement compare les colonnes numériques à la normale, log-normale, exponentielle, gamma, etc.
" + "Critères : AIC (plus bas = mieux) · stat KS (plus petit = mieux) · p>0,05 = acceptable
" + "Conseil débutant : Résultat « norm » → tests standards applicables ; sinon, envisager une transformation log.
" + ), + }, + "sub_jarque_bera": { + "tip": "Teste si asymétrie et kurtosis sont compatibles avec la loi normale.", + "desc": ( + "Test de Jarque-Bera vérifie la forme de la distribution.
" + "Interprétation : p≥0,05 = normalité non rejetée | p<0,05 = significativement non normale
" + "Conseil débutant : La non-normalité est très fréquente. Cela ne signifie pas des données « mauvaises », mais qu'il faut utiliser des méthodes non paramétriques ou des transformations.
" + ), + }, + "sub_power_transform": { + "tip": "Recommandation Box-Cox/Yeo-Johnson pour rapprocher la distribution de la normale.", + "desc": ( + "Transformation de puissance suggère des transformations mathématiques pour rendre les données asymétriques plus symétriques.
" + "Deux méthodes : Box-Cox (valeurs positives uniquement) | Yeo-Johnson (toutes valeurs)
" + "Conseil débutant : Étape de prétraitement essentielle pour la régression linéaire et les réseaux de neurones.
" + ), + }, + "sub_kde_bandwidth": { + "tip": "Largeur de bande KDE optimale selon les règles de Scott/Silverman.", + "desc": ( + "Largeur de bande KDE – le degré de « lissage » optimal de la courbe de densité.
" + "Compromis : Faible = détaillé mais surapprentissage du bruit | Élevé = lisse mais peut manquer des caractéristiques
" + "Conseil débutant : Grande différence entre les deux règles → possibilité d'aberrants ou de multimodalité.
" + ), + }, + "sub_partial_corr": { + "tip": "Relation directe entre deux variables après contrôle de toutes les autres.", + "desc": ( + "Corrélation partielle : la relation est-elle directe ou causée par une tierce variable ?
" + "Exemple : Ventes de glaces ↔ noyades corrélées ; après contrôle de la température → corrél. partielle ≈ 0. La température est le vrai moteur.
" + "Interprétation : Corrélation partielle élevée = relation directe | ≈0 = corrélation fallacieuse
" + ), + }, + "sub_mutual_info": { + "tip": "Mesure informationnelle capturant les dépendances linéaires et non linéaires.", + "desc": ( + "Information mutuelle (MI) – quantité d'information obtenue sur une variable en connaissant l'autre.
" + "Essentiel : MI=0 = indépendance statistique | MI>0 = dépendance. Capte toute relation, y compris non linéaire quand Pearson est nul.
" + "Conseil débutant : MI élevée + Pearson faible → relation non linéaire. Vérifiez le nuage de points.
" + ), + }, + "sub_bootstrap_ci": { + "tip": "Intervalle de confiance à 95% par rééchantillonnage pour chaque corrélation.", + "desc": ( + "IC Bootstrap montre la fiabilité de chaque estimation de corrélation.
" + "Méthode : 1000 tirages avec remise → calcul corrélation → percentiles 2,5–97,5 = IC 95%
" + "Interprétation : IC étroit = stable | IC large = incertitude élevée | IC franchissant zéro = peut-être non significatif
" + ), + }, + "sub_distance_corr": { + "tip": "Corrélation de distance de Szekely – détecte les dépendances non linéaires.", + "desc": ( + "Corrélation de distance : exactement nulle seulement si les variables sont vraiment indépendantes.
" + "Comparaison : Faible Pearson + haute corrélation de distance → relation non linéaire ! Consulter le nuage de points.
" + ), + }, + "sub_kmeans": { + "tip": "K-Means avec optimisation automatique du nombre de clusters par silhouette.", + "desc": ( + "Clustering K-Means divise automatiquement les données en k groupes.
" + "Processus : Standardisation → k=2–10 essayés → k avec le meilleur score silhouette choisi
" + "Métriques : Silhouette >0,5 = bon | >0,7 = structure forte | Inertie (WCSS) : plus bas = plus compact
" + "Conseil débutant : K-Means suppose des clusters globalement sphériques. Pour des formes irrégulières : DBSCAN.
" + ), + }, + "sub_dbscan": { + "tip": "Clustering par densité : formes arbitraires sans spécifier k.", + "desc": ( + "DBSCAN forme des clusters en trouvant les zones denses des données.
" + "Avantages : Pas de k · Formes quelconques · Détection automatique du bruit
" + "Conseil débutant : Un seul cluster + beaucoup de bruit ? Pas de structure de densité claire ou ajustement d'eps nécessaire.
" + ), + }, + "sub_hierarchical": { + "tip": "Dendrogramme : fusion progressive des clusters.", + "desc": ( + "Clustering hiérarchique – dendrogramme montrant le processus de fusion étape par étape.
" + "Lecture : Axe y = distance de fusion. Ligne horizontale à n'importe quelle hauteur → différents k. Longue ligne verticale = frontière naturelle.
" + ), + }, + "sub_cluster_profiles": { + "tip": "Profil statistique de chaque cluster K-Means sur toutes les features.", + "desc": ( + "Profils de clusters montrent moyenne/écart-type par cluster, révélant les particularités.
" + "Utilisation : Les features avec les plus grandes différences de moyenne = caractéristiques définissantes.
" + "Conseil débutant : Utilisez ce tableau pour nommer les clusters (ex. : « Clients premium », « Économes »).
" + ), + }, + "sub_tsne": { + "tip": "Projection 2D non linéaire préservant les voisinages locaux.", + "desc": ( + "t-SNE comprime les données haute dimension en 2D tout en préservant les similarités.
" + "Lecture : Proches en 2D = similaires dans l'espace original. Groupes nets = clusters potentiellement réels.
" + "⚠️ Attention : Distances inter-clusters sans signification · Tailles non représentatives · Résultats variables (stochastique)
" + ), + }, + "sub_umap": { + "tip": "Projection 2D rapide préservant structures locales et globales.", + "desc": ( + "UMAP – alternative moderne au t-SNE, plus rapide et meilleur pour la structure globale.
" + "Avantages : Rapide · Meilleure préservation globale · Positions relatives des clusters partiellement significatives
" + "Conseil débutant : Si t-SNE et UMAP montrent des clusters similaires, ils sont probablement réels.
" + ), + }, + "sub_factor_analysis": { + "tip": "Découverte des facteurs latents cachés derrière les corrélations observées.", + "desc": ( + "Analyse factorielle explique pourquoi certaines variables sont corrélées – en supposant des facteurs latents.
" + "Analogie : 10 notes d'examen pourraient refléter 3 facteurs latents : « aptitude verbale », « aptitude mathématique », « créativité ».
" + "Conseil débutant : Les variables avec forte variance de bruit ne sont pas expliquées par les facteurs communs et mesurent peut-être quelque chose d'unique.
" + ), + }, + "sub_factor_loadings": { + "tip": "Force de l'association entre chaque variable et chaque facteur latent.", + "desc": ( + "Loadings factoriels quantifient le lien entre variables originales et facteurs latents.
" + "Interprétation : |loading|>0,7 = fort | 0,4-0,7 = modéré | <0,4 = faible
" + "Loadings croisés : Variable sur plusieurs facteurs – ne convient pas bien au modèle factoriel.
" + ), + }, + "sub_feature_contrib": { + "tip": "Contribution en variance de chaque feature pondérée par l'ACP.", + "desc": ( + "Contribution ACP-pondérée classe les features par contribution à la variance totale pour la sélection non supervisée.
" + "Conseil débutant : Les features en bas du classement contribuent très peu – candidates à la suppression.
" + ), + }, + "sub_interaction": { + "tip": "Détection d'interactions produit synergiques entre paires de features.", + "desc": ( + "Détection d'interactions : le produit de deux features contient-il une information absente individuellement ?
" + "Conseil débutant : Interaction forte trouvée ? Ajouter le produit comme nouvelle feature peut améliorer significativement le modèle.
" + ), + }, + "sub_monotonic": { + "tip": "Pearson vs Spearman pour identifier des relations monotones non linéaires.", + "desc": ( + "Analyse monotone détecte les paires de variables qui croissent/décroissent ensemble sans être linéaires.
" + "Clé : Grande différence |Spearman|−|Pearson| = motif exponentiel/logarithmique. Une transformation monotone peut améliorer le modèle linéaire.
" + ), + }, + "sub_binning": { + "tip": "Évaluation du binning à largeur égale et fréquence égale par entropie.", + "desc": ( + "Analyse de binning – stratégies de discrétisation des variables continues.
" + "Deux stratégies : Largeur égale (sensible aux aberrants) vs fréquence égale (adapté aux données asymétriques)
" + ), + }, + "sub_cardinality": { + "tip": "Analyse des valeurs uniques et recommandation d'encodage.", + "desc": ( + "Cardinalité et encodage : recommandation selon le nombre de valeurs uniques :
" + "Risque de fuite examine les features accédant directement/indirectement à la variable cible.
" + "Conseil débutant : Précision « trop belle pour être vraie » (ex. 99%) → la fuite de données est le suspect nᵒ1.
" + ), + }, + "sub_iso_forest": { + "tip": "Détection d'anomalies par isolation aléatoire (forêt d'arbres).", + "desc": ( + "Isolation Forest : « les anomalies sont plus faciles à isoler ».
" + "Principe : Arbres de partition aléatoire → longueur moyenne du chemin d'isolation → chemin court = plus anomal
" + "Conseil débutant : Peu de paramètres, bon en haute dimension – premier choix pour la détection d'anomalies.
" + ), + }, + "sub_lof": { + "tip": "Détection basée sur la densité locale comparée aux voisins.", + "desc": ( + "LOF compare la densité locale d'un point à celle de ses k plus proches voisins.
" + "LOF≈1 = normal | LOF>>1 = bien plus clairsemé que les voisins (anomal)
" + "Conseil débutant : En présence de clusters de densités différentes, LOF est plus efficace qu'Isolation Forest.
" + ), + }, + "sub_mahalanobis": { + "tip": "Distance multivariée au centre prenant en compte les corrélations.", + "desc": ( + "Distance de Mahalanobis mesure la distance au centre des données en tenant compte de la structure de covariance.
" + "vs Euclidien : L'euclidien traite toutes les directions de la même façon ; Mahalanobis intègre les corrélations – " + "deux features habituellement conjointes évoluant en sens opposé = vrai anomal.
" + ), + }, + "sub_consensus": { + "tip": "Anomalie signalée uniquement si ≥2 méthodes sur 3 sont d'accord.", + "desc": ( + "Détection par consensus combine Isolation Forest, LOF et Mahalanobis.
" + "Règle : ≥2/3 méthodes concordantes → signalé comme anomalie. Réduit fortement les fausses alertes.
" + "Conseil débutant : Commencez l'investigation par les signalements par consensus – les candidats les plus fiables.
" + ), + }, + "test_levene": { + "tip": "Vérifie l'égalité des variances entre groupes.", + "desc": ( + "Test de Levene confirme si les variances des groupes sont approximativement égales.
" + "Interprétation : p>0,05 = homogénéité confirmée | p≤0,05 = variances différentes → utiliser Welch ou non-paramétrique
" + ), + }, + "test_kruskal_wallis": { + "tip": "ANOVA non paramétrique : les groupes viennent-ils de la même distribution ?", + "desc": ( + "Test de Kruskal-Wallis – équivalent non paramétrique de l'ANOVA à un facteur.
" + "Interprétation : p<0,05 = au moins un groupe diffère → Mann-Whitney par paires | p≥0,05 = pas de différence
" + ), + }, + "test_mann_whitney": { + "tip": "Test non paramétrique à deux échantillons : comparaison de deux groupes indépendants.", + "desc": ( + "Test de Mann-Whitney U : les deux groupes proviennent-ils de la même distribution ?
" + "Interprétation : p<0,05 = différence significative | p≥0,05 = pas de différence
" + "Conseil débutant : Même moyenne, dispersion différente → peut quand même être significatif.
" + ), + }, + "test_chi_square": { + "tip": "Les fréquences observées s'écartent-elles des fréquences attendues ?", + "desc": ( + "Test du Chi-deux d'ajustement : la distribution observée correspond-elle à l'attendue (par défaut : uniforme) ?
" + "Interprétation : p<0,05 = écart significatif | p≥0,05 = concordance
" + "Condition : Chaque fréquence attendue doit être ≥5.
" + ), + }, + "test_grubbs": { + "tip": "La valeur la plus extrême est-elle un aberrant statistiquement significatif ?", + "desc": ( + "Test de Grubbs : la valeur extrême est-elle un phénomène naturel ou une vraie anomalie ?
" + "Interprétation : p<0,05 = aberrant significatif | p≥0,05 = dans les limites attendues
" + "Conseil débutant : Grubbs ne teste qu'un seul extrême. Pour plusieurs aberrants : IQR ou Isolation Forest.
" + ), + }, + "test_adf": { + "tip": "La série temporelle est-elle stationnaire ?", + "desc": ( + "Test ADF : recherche de racine unitaire (non-stationnarité).
" + "Interprétation : p<0,05 = stationnaire ✓ | p≥0,05 = non stationnaire → différenciation nécessaire
" + "Conseil débutant : Les colonnes temporelles doivent passer le test ADF avant régression. Des prédicteurs non stationnaires invalident la régression.
" + ), + }, + "sub_column_quality": { + "tip": "Score qualité par colonne : complétude, unicité, validité.", + "desc": ( + "Qualité par colonne décompose le score global pour identifier les colonnes problématiques.
" + "Conseil débutant : Les colonnes au score très bas sont les premières à nettoyer ou supprimer.
" + ), + }, + "sub_cleaning_log": { + "tip": "Journal étape par étape de toutes les opérations de nettoyage automatique.", + "desc": "Journal de nettoyage documente chaque transformation pour une transparence et reproductibilité totales.
", + }, + "sub_detected_issues": { + "tip": "Liste des problèmes de qualité détectés lors du prétraitement.", + "desc": "Problèmes détectés : types mixtes, motifs suspects (ex. « 999 » comme marqueur manquant), erreurs d'encodage, etc.
", + }, + "sub_normality_tests": { + "tip": "Tests de Shapiro-Wilk, Anderson-Darling et Jarque-Bera par colonne.", + "desc": ( + "Tests de normalité évaluent la normalité de chaque colonne avec trois tests complémentaires.
" + "Les trois à p<0,05 → probablement non normale. En cas de désaccord, vérifier l'histogramme.
" + ), + }, + "sub_vif": { + "tip": "Facteur d'inflation de la variance pour détecter la multicolinéarité.", + "desc": ( + "VIF mesure à quel point la variance d'un coefficient de régression est gonflée par la corrélation entre features.
" + "Interprétation : VIF=1 aucune corrélation | 1-5 faible | 5-10 modéré | >10 sévère – suppression/fusion à envisager
" + "Conseil débutant : VIF élevé → coefficients instables ; un petit changement de données peut inverser le signe.
" + ), + }, + "sub_summary": { + "tip": "Aperçu rapide : forme de distribution, normalité et nombre d'aberrants.", + "desc": "Résumé rassemble catégorie d'asymétrie, type de kurtosis, indicateur de normalité et nombre d'aberrants en un tableau.
", + }, + "sub_variance_explained": { + "tip": "Proportion de variance captée par chaque composante principale.", + "desc": ( + "Variance expliquée montre la contribution individuelle et cumulée de chaque composante.
" + "Scree plot : Le « coude » est le point où les composantes supplémentaires apportent de moins en moins.
" + ), + }, + "sub_loadings": { + "tip": "Poids de chaque feature originale dans chaque composante principale.", + "desc": ( + "Loadings ACP montrent le poids de chaque feature dans chaque composante.
" + "Exemple : PC1 charge fortement sur « Taille », « Poids », « IMC » → interprétable comme composante « corpulence ».
" + ), + }, +} + + +def get_method_info_json() -> str: + "“”Return METHOD_INFO dict as a JSON string for embedding in JS.“”" + import json + return json.dumps(METHOD_INFO, ensure_ascii=False) + + +# ===================================================================== +# Metric tooltip translations (column header / cell hover tips) +# ===================================================================== +# METRIC_TIPS_I18N[lang_code][metric_key] = translated tooltip string +# English tips are canonical; other languages mirror them. + +METRIC_TIPS_I18N: dict[str, dict[str, str]] = {} + +# ----- English (en) -------------------------------------------------- +METRIC_TIPS_I18N["en"] = { + "type": "Inferred data type of the column (numeric, categorical, text, datetime, boolean).", + "count": "Number of non-null values in the column.", + "missing": "Number of missing (null / NaN) values.", + "missing_%": "Percentage of missing values = (missing / total rows) x 100.", + "unique": "Number of distinct values in the column.", + "mean": "Arithmetic mean = sum of values / count.", + "median": "Middle value when data is sorted (50th percentile).", + "std": "Standard deviation -- measures spread around the mean. Larger = more dispersed.", + "se": "Standard error of the mean = std / sqrt(n). Indicates precision of the sample mean.", + "cv": "Coefficient of variation = std / |mean|. Unitless relative measure of variability.", + "mad": "Median Absolute Deviation = median(|xi - median|). Robust measure of spread.", + "min": "Minimum value in the column.", + "max": "Maximum value in the column.", + "range": "Range = max - min. Total spread of the data.", + "p5": "5th percentile -- 5% of data falls below this value.", + "q1": "1st quartile (25th percentile) -- 25% of data falls below this value.", + "q3": "3rd quartile (75th percentile) -- 75% of data falls below this value.", + "p95": "95th percentile -- 95% of data falls below this value.", + "iqr": "Interquartile Range = Q3 - Q1. 
Middle 50% spread, used for outlier detection.", + "skewness": "Skewness measures distribution asymmetry. 0 = symmetric, >0 = right-skewed, <0 = left-skewed.", + "kurtosis": "Excess kurtosis measures tail heaviness. 0 = normal, >0 = heavy tails, <0 = light tails.", + "top": "Most frequently occurring value in the column.", + "freq": "Frequency count of the most common value.", + "n": "Number of non-null observations used for the distribution test.", + "skew_type": "Interpretation of skewness: symmetric (|s|<0.5), moderate skew (0.5-1), high skew (>1).", + "kurt_type": "Interpretation of kurtosis: mesokurtic (~0), leptokurtic (>1, heavy tails), platykurtic (<-1, light tails).", + "normality_test": "Primary normality test used (Shapiro-Wilk for n<=5000, D'Agostino-Pearson for larger).", + "normality_p": "p-value of the primary normality test. p<0.05 -> likely non-normal.", + "is_normal_0.05": "True if p-value >= 0.05, meaning the null hypothesis of normality is not rejected at alpha=0.05.", + "shapiro_p": "p-value from Shapiro-Wilk test. Best for small-medium samples (n<=5000).", + "dagostino_p": "p-value from D'Agostino-Pearson test. Uses skewness + kurtosis, good for n>=20.", + "ks_p": "p-value from Kolmogorov-Smirnov test vs. normal distribution.", + "anderson_stat": "Anderson-Darling test statistic. Higher = stronger evidence against normality.", + "anderson_5pct_cv": "Anderson-Darling 5% critical value. If stat > cv -> reject normality at 5%.", + "missing_count": "Number of missing (null) values in this column.", + "missing_ratio": "Fraction of missing values = missing_count / total_rows (0 to 1).", + "dtype": "Pandas dtype of the column.", + "lower_bound": "IQR lower fence = Q1 - k x IQR. Values below this are outliers (default k=1.5).", + "upper_bound": "IQR upper fence = Q3 + k x IQR. 
Values above this are outliers (default k=1.5).", + "outlier_count": "Number of values falling outside the outlier bounds.", + "outlier_%": "Percentage of outlier values = (outlier_count / total) x 100.", + "min_outlier": "Smallest outlier value detected.", + "max_outlier": "Largest outlier value detected.", + "threshold": "Z-score threshold used. Values with |z| > threshold are outliers.", + "max_zscore": "Maximum absolute z-score found in the column.", + "top_value": "The most frequently occurring category value.", + "top_frequency": "Count of the most frequent category.", + "top_%": "Percentage of the most frequent category = (top_freq / total) x 100.", + "entropy": "Shannon entropy (bits). Higher = more uniform distribution among categories.", + "norm_entropy": "Normalized entropy = entropy / log2(unique). 1.0 = perfectly uniform.", + "max_entropy": "Maximum possible entropy = log2(unique). Achieved when all categories are equally frequent.", + "normalized_entropy": "Same as norm_entropy: entropy / max_entropy. 1.0 = uniform.", + "unique_values": "Number of distinct category values.", + "variance": "Variance of the column = mean of squared deviations from mean.", + "mean_abs_corr": "Mean absolute Pearson correlation with all other numeric columns.", + "avg_mutual_info": "Average mutual information with all other columns (uses sklearn).", + "VIF": "Variance Inflation Factor. VIF=1 -> no multicollinearity, >5 -> moderate, >10 -> severe.", + "multicollinearity": "Interpretation of VIF: low (<5), moderate (5-10), or high (>=10).", + "variance_ratio": "Proportion of total variance explained by this principal component.", + "cumulative_ratio": "Cumulative proportion of variance explained up to this component.", + "eigenvalue": "Eigenvalue of the covariance matrix for this component. 
Higher = more variance.", + "n_components": "Total number of principal components computed.", + "total_variance_explained": "Total variance captured by all computed components.", + "components_for_90pct": "Minimum number of components needed to explain >= 90% of variance.", + "top_component_variance": "Variance ratio of the first (most important) principal component.", + "total_rows": "Total number of rows in the dataset.", + "duplicate_rows": "Number of exact duplicate rows found.", + "unique_rows": "Number of unique (non-duplicate) rows.", + "duplicate_ratio": "Fraction of duplicate rows = duplicate_rows / total_rows.", + "uniqueness_ratio": "Ratio of unique values = unique / total_non_null. 1.0 = all unique.", + "total_non_null": "Number of non-null values used for uniqueness calculation.", + "is_unique_key": "True if every non-null value is unique -- potential primary key.", + "completeness": "Fraction of non-missing values = 1 - (missing / total). 1.0 = no missing data.", + "uniqueness": "Ratio of unique values to total non-null values. Higher = more diverse.", + "consistency": "Measures type consistency. 1.0 = all values match the expected data type.", + "validity": "Fraction of values within expected ranges/formats. 1.0 = all valid.", + "overall": "Weighted quality score = 0.35*completeness + 0.25*uniqueness + 0.20*consistency + 0.20*validity.", + "quality_score": "Per-column quality score combining completeness and uniqueness.", + "column": "Column name in the dataset.", + "component": "Principal component identifier (PC1, PC2, ...).", + "value": "Category or discrete value.", + "percentage": "Percentage share of this value = (count / total) x 100.", + "best_distribution": "Scipy distribution that best fits the data according to AIC.", + "aic": "Akaike Information Criterion -- lower is better. Penalises complexity.", + "bic": "Bayesian Information Criterion -- lower is better. 
More conservative than AIC.", + "ks_statistic": "Kolmogorov-Smirnov statistic measuring max CDF deviation from the fitted distribution.", + "jarque_bera_stat": "Jarque-Bera test statistic. Large values indicate non-normality.", + "jb_p_value": "p-value of the Jarque-Bera test. p < 0.05 -> reject normality.", + "recommended_transform": "Power transform recommended to make the column more normal (Box-Cox or Yeo-Johnson).", + "original_skew": "Skewness of the original (untransformed) column.", + "transformed_skew": "Skewness after applying the recommended power transform.", + "bandwidth_silverman": "Kernel bandwidth via Silverman's rule for KDE estimation.", + "bandwidth_scott": "Kernel bandwidth via Scott's rule for KDE estimation.", + "partial_corr": "Partial correlation -- Pearson correlation after removing confounding effects.", + "mutual_information": "Mutual information (bits) -- measures non-linear dependency between two variables.", + "ci_lower": "Lower bound of the 95% bootstrap confidence interval for the correlation.", + "ci_upper": "Upper bound of the 95% bootstrap confidence interval for the correlation.", + "distance_corr": "Szekely distance correlation -- captures non-linear dependencies (0=independent, 1=dependent).", + "optimal_k": "Best number of clusters determined by silhouette score analysis.", + "best_silhouette": "Highest mean silhouette score across evaluated k values (-1 to 1, higher=better).", + "inertia": "Within-cluster sum of squares (WCSS). Lower = tighter clusters.", + "n_clusters_dbscan": "Number of clusters found by DBSCAN (excludes noise).", + "noise_ratio": "Fraction of points labelled as noise by DBSCAN.", + "eps": "DBSCAN epsilon -- neighbourhood radius auto-estimated from k-distance plot.", + "kl_divergence": "Kullback-Leibler divergence of the t-SNE embedding. Lower = better fit.", + "tsne_perplexity": "Perplexity parameter for t-SNE (balances local vs. 
global structure).", + "n_factors": "Number of latent factors retained via Kaiser criterion (eigenvalue > 1).", + "factor_loading": "Correlation between an observed variable and a latent factor.", + "noise_variance": "Estimated noise (uniqueness) for each variable in Factor Analysis.", + "interaction_strength": "Pearson correlation between a product-interaction term and the top feature.", + "monotonic_gap": "Gap between Pearson and Spearman correlations -- large gap -> non-linear monotonic.", + "entropy_equal_width": "Shannon entropy of equal-width binning. Lower = more concentrated distribution.", + "entropy_equal_freq": "Shannon entropy of equal-frequency binning. Lower = more concentrated.", + "cardinality": "Number of unique values in a categorical column.", + "encoding_rec": "Recommended encoding strategy based on cardinality analysis.", + "leakage_risk": "Risk level (low/medium/high) that a feature may leak target information.", + "anomaly_score_if": "Isolation Forest anomaly score. More negative = more anomalous.", + "lof_score": "Local Outlier Factor minus-score. More negative = more anomalous.", + "mahalanobis_dist": "Mahalanobis distance from the data centroid. Larger = more unusual.", + "consensus_flag": "True if >= 2 out of 3 anomaly methods agree the point is anomalous.", + "levene_stat": "Levene test statistic for equality of variances.", + "levene_p": "p-value of Levene's test. p < 0.05 -> variances are significantly different.", + "kw_stat": "Kruskal-Wallis H statistic -- non-parametric one-way ANOVA.", + "kw_p": "p-value of Kruskal-Wallis test. p < 0.05 -> at least one group differs.", + "mw_stat": "Mann-Whitney U statistic -- non-parametric two-sample rank test.", + "mw_p": "p-value of Mann-Whitney U test.", + "chi2_stat": "Chi-square goodness-of-fit statistic vs. 
uniform distribution.", + "chi2_p": "p-value of chi-square goodness-of-fit test.", + "grubbs_stat": "Grubbs test statistic for detecting a single outlier.", + "grubbs_p": "p-value of Grubbs test.", + "adf_stat": "Augmented Dickey-Fuller test statistic for stationarity.", + "adf_p": "p-value of the ADF test. p < 0.05 -> series is stationary.", + "numeric_ratio": "Fraction of columns that are numeric.", + "categorical_ratio": "Fraction of columns that are categorical.", + "duplicate_row_ratio": "Fraction of rows that are exact duplicates.", +} + +# ----- Korean (ko) --------------------------------------------------- +METRIC_TIPS_I18N["ko"] = { + "type": "Column(Feature)의 추론된 데이터 타입 (numeric, categorical, text, datetime, boolean).", + "count": "Column(Feature)의 null이 아닌 값 개수.", + "missing": "결측(null / NaN) 값 개수.", + "missing_%": "결측 비율 = (결측 수 / 전체 행) x 100.", + "unique": "Column(Feature)의 고유(distinct) 값 개수.", + "mean": "산술 평균 = 값 합계 / 개수.", + "median": "데이터를 정렬했을 때 중앙값 (50번째 백분위).", + "std": "표준편차 -- 평균 주위의 분산 정도를 측정. 클수록 분산이 큼.", + "se": "평균의 표준오차 = std / sqrt(n). 표본 평균의 정밀도를 나타냄.", + "cv": "변동계수 = std / |mean|. 단위 없는 상대적 변동성 측정치.", + "mad": "중앙값 절대 편차 = median(|xi - median|). 강건한 산포 측정치.", + "min": "Column(Feature)의 최솟값.", + "max": "Column(Feature)의 최댓값.", + "range": "범위 = max - min. 데이터의 전체 분산 폭.", + "p5": "5번째 백분위 -- 데이터의 5%가 이 값보다 낮음.", + "q1": "1사분위수 (25번째 백분위) -- 데이터의 25%가 이 값보다 낮음.", + "q3": "3사분위수 (75번째 백분위) -- 데이터의 75%가 이 값보다 낮음.", + "p95": "95번째 백분위 -- 데이터의 95%가 이 값보다 낮음.", + "iqr": "사분위 범위 = Q3 - Q1. 중간 50% 분포 폭, 이상치 탐지에 사용.", + "skewness": "왜도: 분포의 비대칭성. 0 = 대칭, >0 = 오른쪽 치우침, <0 = 왼쪽 치우침.", + "kurtosis": "초과 첨도: 꼬리 두께. 
0 = 정규, >0 = 두꺼운 꼬리, <0 = 얇은 꼬리.", + "top": "Column(Feature)에서 가장 자주 나타나는 값.", + "freq": "최빈값의 빈도 수.", + "n": "분포 검정에 사용된 null이 아닌 관측치 수.", + "skew_type": "왜도 해석: 대칭(|s|<0.5), 중간 왜도(0.5-1), 높은 왜도(>1).", + "kurt_type": "첨도 해석: 중간첨도(~0), 급첨(>1, 두꺼운 꼬리), 완첨(<-1, 얇은 꼬리).", + "normality_test": "사용된 주 정규성 검정 (Shapiro-Wilk n<=5000, D'Agostino-Pearson 큰 표본).", + "normality_p": "주 정규성 검정의 p값. p<0.05 -> 비정규 가능성 높음.", + "is_normal_0.05": "p값 >= 0.05이면 True, 즉 alpha=0.05에서 정규성 귀무가설이 기각되지 않음.", + "shapiro_p": "Shapiro-Wilk 검정 p값. 소-중 표본에 적합 (n<=5000).", + "dagostino_p": "D'Agostino-Pearson 검정 p값. 왜도 + 첨도 사용, n>=20에 적합.", + "ks_p": "Kolmogorov-Smirnov 검정 p값 (정규분포와 비교).", + "anderson_stat": "Anderson-Darling 검정 통계량. 높을수록 정규성 반증 강함.", + "anderson_5pct_cv": "Anderson-Darling 5% 임계값. stat > cv -> 5%에서 정규성 기각.", + "missing_count": "이 Column(Feature)의 결측(null) 값 개수.", + "missing_ratio": "결측 비율 = missing_count / total_rows (0~1).", + "dtype": "Column(Feature)의 Pandas dtype.", + "lower_bound": "IQR 하한 = Q1 - k x IQR. 이보다 낮으면 이상치 (기본 k=1.5).", + "upper_bound": "IQR 상한 = Q3 + k x IQR. 이보다 높으면 이상치 (기본 k=1.5).", + "outlier_count": "이상치 범위 밖의 값 개수.", + "outlier_%": "이상치 비율 = (outlier_count / 전체) x 100.", + "min_outlier": "탐지된 가장 작은 이상치 값.", + "max_outlier": "탐지된 가장 큰 이상치 값.", + "threshold": "사용된 Z-점수 임계값. |z| > threshold이면 이상치.", + "max_zscore": "Column(Feature)에서 발견된 최대 절대 Z-점수.", + "top_value": "가장 빈번한 범주 값.", + "top_frequency": "가장 빈번한 범주의 빈도 수.", + "top_%": "가장 빈번한 범주의 비율 = (top_freq / 전체) x 100.", + "entropy": "Shannon 엔트로피 (비트). 높을수록 범주 간 분포가 더 균일.", + "norm_entropy": "정규화 엔트로피 = entropy / log2(unique). 1.0 = 완전히 균일.", + "max_entropy": "최대 가능 엔트로피 = log2(unique). 모든 범주가 동일 빈도일 때 달성.", + "normalized_entropy": "norm_entropy와 동일: entropy / max_entropy. 
1.0 = 균일.", + "unique_values": "고유 범주 값 개수.", + "variance": "Column(Feature)의 분산 = 평균으로부터 편차 제곱의 평균.", + "mean_abs_corr": "다른 모든 수치형 Column(Feature)과의 평균 절대 Pearson 상관계수.", + "avg_mutual_info": "다른 모든 Column(Feature)과의 평균 상호정보량.", + "VIF": "분산팽창계수. VIF=1 -> 다중공선성 없음, >5 -> 보통, >10 -> 심각.", + "multicollinearity": "VIF 해석: 낮음(<5), 보통(5-10), 높음(>=10).", + "variance_ratio": "이 주성분이 설명하는 총 분산의 비율.", + "cumulative_ratio": "이 성분까지 누적 설명 분산 비율.", + "eigenvalue": "이 성분의 공분산 행렬 고유값. 높을수록 더 많은 분산.", + "n_components": "계산된 주성분 총 개수.", + "total_variance_explained": "모든 성분이 설명하는 총 분산.", + "components_for_90pct": "분산 90% 이상 설명에 필요한 최소 성분 수.", + "top_component_variance": "첫 번째 (가장 중요한) 주성분의 분산 비율.", + "total_rows": "데이터셋의 총 행 수.", + "duplicate_rows": "정확히 중복된 행 수.", + "unique_rows": "고유한(비중복) 행 수.", + "duplicate_ratio": "중복 행 비율 = duplicate_rows / total_rows.", + "uniqueness_ratio": "고유값 비율 = unique / total_non_null. 1.0 = 모두 고유.", + "total_non_null": "고유성 계산에 사용된 null이 아닌 값 수.", + "is_unique_key": "모든 null이 아닌 값이 고유하면 True -- 잠재적 기본 키.", + "completeness": "비결측 값 비율 = 1 - (missing / total). 1.0 = 결측 없음.", + "uniqueness": "고유 값 대 총 non-null 값 비율. 높을수록 다양함.", + "consistency": "타입 일관성 측정. 1.0 = 모든 값이 예상 데이터 타입과 일치.", + "validity": "예상 범위/형식 내 값 비율. 1.0 = 모두 유효.", + "overall": "가중 품질 점수 = 0.35*completeness + 0.25*uniqueness + 0.20*consistency + 0.20*validity.", + "quality_score": "completeness와 uniqueness를 결합한 Column(Feature)별 품질 점수.", + "column": "데이터셋의 Column(Feature) 이름.", + "component": "주성분 식별자 (PC1, PC2, ...).", + "value": "범주 또는 이산 값.", + "percentage": "이 값의 비율 = (count / total) x 100.", + "best_distribution": "AIC 기준 데이터에 가장 적합한 scipy 분포.", + "aic": "아카이케 정보 기준 -- 낮을수록 좋음. 복잡도를 벌점화.", + "bic": "베이지안 정보 기준 -- 낮을수록 좋음. AIC보다 보수적.", + "ks_statistic": "적합된 분포와의 최대 CDF 편차를 측정하는 K-S 통계량.", + "jarque_bera_stat": "Jarque-Bera 검정 통계량. 큰 값은 비정규성을 나타냄.", + "jb_p_value": "Jarque-Bera 검정 p값. 
p < 0.05 -> 정규성 기각.", + "recommended_transform": "Column(Feature)을 더 정규적으로 만드는 권장 변환 (Box-Cox 또는 Yeo-Johnson).", + "original_skew": "원래(변환 전) Column(Feature)의 왜도.", + "transformed_skew": "권장 변환 적용 후 왜도.", + "bandwidth_silverman": "KDE 추정을 위한 Silverman 규칙 커널 대역폭.", + "bandwidth_scott": "KDE 추정을 위한 Scott 규칙 커널 대역폭.", + "partial_corr": "편상관 -- 다른 변수의 혼동 효과를 제거한 후의 Pearson 상관.", + "mutual_information": "상호정보량 (비트) -- 두 변수 간 비선형 의존성 측정.", + "ci_lower": "상관계수에 대한 95% 부트스트랩 신뢰구간의 하한.", + "ci_upper": "상관계수에 대한 95% 부트스트랩 신뢰구간의 상한.", + "distance_corr": "Szekely 거리 상관 -- 비선형 의존성 포착 (0=독립, 1=의존).", + "optimal_k": "실루엣 점수 분석으로 결정된 최적 클러스터 수.", + "best_silhouette": "평가된 k 값 중 최고 평균 실루엣 점수 (-1~1, 높을수록 좋음).", + "inertia": "클러스터 내 제곱합(WCSS). 낮을수록 더 밀집된 클러스터.", + "n_clusters_dbscan": "DBSCAN이 찾은 클러스터 수 (노이즈 제외).", + "noise_ratio": "DBSCAN이 노이즈로 분류한 점의 비율.", + "eps": "DBSCAN epsilon -- k-distance 플롯에서 추정된 이웃 반경.", + "kl_divergence": "t-SNE 임베딩의 KL 발산. 낮을수록 좋은 적합度.", + "tsne_perplexity": "t-SNE perplexity(혼란도) (지역 vs 전역 구조 균형).", + "n_factors": "Kaiser 기준(eigenvalue > 1)으로 유지된 잠재 요인 수.", + "factor_loading": "관측 변수와 잠재 요인 간 상관.", + "noise_variance": "요인 분석에서 각 변수의 추정 노이즈 (고유분산).", + "interaction_strength": "곱 상호작용 항과 상위 특성 간의 Pearson 상관.", + "monotonic_gap": "Pearson과 Spearman 상관 간 차이 -- 큰 차이 -> 비선형 단조 관계.", + "entropy_equal_width": "등폭 구간 Shannon 엔트로피. 낮을수록 분포가 집중적.", + "entropy_equal_freq": "등빈도 구간 Shannon 엔트로피. 낮을수록 집중적.", + "cardinality": "범주형 Column(Feature)의 고유 값 수.", + "encoding_rec": "카디널리티 분석 기반 권장 인코딩 전략.", + "leakage_risk": "특성이 타겟 정보를 누출할 수 있는 위험 수준 (low/medium/high).", + "anomaly_score_if": "Isolation Forest 이상 점수. 더 음수일수록 더 이상적.", + "lof_score": "Local Outlier Factor 마이너스 점수. 더 음수일수록 더 이상적.", + "mahalanobis_dist": "데이터 중심으로부터의 Mahalanobis 거리. 클수록 비정상적.", + "consensus_flag": "3가지 이상치 방법 중 2개 이상 동의하면 True.", + "levene_stat": "등분산성에 대한 Levene 검정 통계량.", + "levene_p": "Levene 검정 p값. 
p < 0.05 -> 분산이 유의하게 다름.", + "kw_stat": "Kruskal-Wallis H 통계량 -- 비모수 일원 ANOVA.", + "kw_p": "Kruskal-Wallis 검정 p값. p < 0.05 -> 적어도 하나의 집단이 다름.", + "mw_stat": "Mann-Whitney U 통계량 -- 비모수 두 표본 순위 검정.", + "mw_p": "Mann-Whitney U 검정 p값.", + "chi2_stat": "균일 분포 대비 카이제곱 적합도 검정 통계량.", + "chi2_p": "카이제곱 적합도 검정 p값.", + "grubbs_stat": "단일 이상치 탐지를 위한 Grubbs 검정 통계량.", + "grubbs_p": "Grubbs 검정 p값.", + "adf_stat": "정상성에 대한 ADF(Augmented Dickey-Fuller) 검정 통계량.", + "adf_p": "ADF 검정 p값. p < 0.05 -> 시계열이 정상적.", + "numeric_ratio": "수치형 Column(Feature) 비율.", + "categorical_ratio": "범주형 Column(Feature) 비율.", + "duplicate_row_ratio": "정확히 중복된 행의 비율.", +} + +# ----- Chinese (zh) -------------------------------------------------- +METRIC_TIPS_I18N["zh"] = { + "type": "推断的数据类型 (numeric, categorical, text, datetime, boolean).", + "count": "该列中非空值的数量.", + "missing": "缺失(null / NaN)值的数量.", + "missing_%": "缺失率 = (缺失数 / 总行数) x 100.", + "unique": "该列中不同(唯一)值的数量.", + "mean": "算术平均 = 值总和 / 个数.", + "median": "按大小排序后的中间值(第50百分位).", + "std": "标准差 -- 衡量围绕均值的离散程度. 越大越分散.", + "se": "均值标准误 = std / sqrt(n). 表示样本均值的精度.", + "cv": "变异系数 = std / |mean|. 无单位的相对变异度量.", + "mad": "中位绝对偏差 = median(|xi - median|). 稳健的离散度量.", + "min": "该列的最小值.", + "max": "该列的最大值.", + "range": "极差 = max - min. 数据的总分布幅度.", + "p5": "第5百分位 -- 5%的数据低于此值.", + "q1": "第1四分位(第25百分位) -- 25%的数据低于此值.", + "q3": "第3四分位(第75百分位) -- 75%的数据低于此值.", + "p95": "第95百分位 -- 95%的数据低于此值.", + "iqr": "四分位距 = Q3 - Q1. 中间50%的分布幅度, 用于异常值检测.", + "skewness": "偏度衡量分布不对称性. 0=对称, >0=右偏, <0=左偏.", + "kurtosis": "超额峰度衡量尾部厚度. 0=正态, >0=厚尾, <0=薄尾.", + "top": "该列中出现频率最高的值.", + "freq": "最频繁值的出现次数.", + "n": "用于分布检验的非空观测数.", + "skew_type": "偏度解释: 对称(|s|<0.5), 中等偏(0.5-1), 高偏(>1).", + "kurt_type": "峰度解释: 中等峰(~0), 尖峰(>1,厚尾), 扁峰(<-1,薄尾).", + "normality_test": "使用的主要正态性检验(n<=5000用Shapiro-Wilk, 大样本用D'Agostino-Pearson).", + "normality_p": "主要正态性检验的p值. p<0.05 -> 可能非正态.", + "is_normal_0.05": "若p值>=0.05则为True, 即在alpha=0.05下正态性零假设未被拒绝.", + "shapiro_p": "Shapiro-Wilk检验p值. 
适合中小样本(n<=5000).", + "dagostino_p": "D'Agostino-Pearson检验p值. 使用偏度+峰度, 适合n>=20.", + "ks_p": "Kolmogorov-Smirnov检验p值(与正态分布比较).", + "anderson_stat": "Anderson-Darling检验统计量. 越高则反正态证据越强.", + "anderson_5pct_cv": "Anderson-Darling 5%临界值. stat > cv -> 在5%水平拒绝正态性.", + "missing_count": "该列中缺失(null)值的数量.", + "missing_ratio": "缺失比例 = missing_count / total_rows (0至1).", + "dtype": "该列的Pandas数据类型.", + "lower_bound": "IQR下限 = Q1 - k x IQR. 低于此值为异常值(默认k=1.5).", + "upper_bound": "IQR上限 = Q3 + k x IQR. 高于此值为异常值(默认k=1.5).", + "outlier_count": "落在异常值范围外的值数量.", + "outlier_%": "异常值百分比 = (outlier_count / 总数) x 100.", + "min_outlier": "检测到的最小异常值.", + "max_outlier": "检测到的最大异常值.", + "threshold": "使用的Z分数阈值. |z| > threshold则为异常值.", + "max_zscore": "该列中找到的最大绝对Z分数.", + "top_value": "出现频率最高的类别值.", + "top_frequency": "最频繁类别的计数.", + "top_%": "最频繁类别的百分比 = (top_freq / 总数) x 100.", + "entropy": "Shannon熵(比特). 越高意味着类别间分布越均匀.", + "norm_entropy": "归一化熵 = entropy / log2(unique). 1.0=完全均匀.", + "max_entropy": "最大可能熵 = log2(unique). 所有类别频率相同时达到.", + "normalized_entropy": "与norm_entropy相同: entropy / max_entropy. 1.0=均匀.", + "unique_values": "不同类别值的数量.", + "variance": "该列方差 = 与均值偏差平方的均值.", + "mean_abs_corr": "与所有其他数值列的平均绝对Pearson相关系数.", + "avg_mutual_info": "与所有其他列的平均互信息量.", + "VIF": "方差膨胀因子. VIF=1->无多重共线性, >5->中等, >10->严重.", + "multicollinearity": "VIF解释: 低(<5), 中等(5-10), 高(>=10).", + "variance_ratio": "该主成分解释的总方差比例.", + "cumulative_ratio": "截至该成分的累计方差解释比例.", + "eigenvalue": "该成分的协方差矩阵特征值. 越高解释越多方差.", + "n_components": "计算的主成分总数.", + "total_variance_explained": "所有成分解释的总方差.", + "components_for_90pct": "解释>=90%方差所需的最小成分数.", + "top_component_variance": "第一个(最重要)主成分的方差比例.", + "total_rows": "数据集的总行数.", + "duplicate_rows": "找到的完全重复行数.", + "unique_rows": "唯一(非重复)行数.", + "duplicate_ratio": "重复行比例 = duplicate_rows / total_rows.", + "uniqueness_ratio": "唯一值比例 = unique / total_non_null. 
1.0=全部唯一.", + "total_non_null": "用于唯一性计算的非空值数.", + "is_unique_key": "若每个非空值都唯一则为True -- 可能是主键.", + "completeness": "非缺失值比例 = 1 - (missing / total). 1.0=无缺失.", + "uniqueness": "唯一值与总非空值的比率. 越高越多样.", + "consistency": "类型一致性度量. 1.0=所有值匹配预期数据类型.", + "validity": "在预期范围/格式内的值比例. 1.0=全部有效.", + "overall": "加权质量分数 = 0.35*completeness + 0.25*uniqueness + 0.20*consistency + 0.20*validity.", + "quality_score": "结合完整性和唯一性的逐列质量分数.", + "column": "数据集中的列名.", + "component": "主成分标识符(PC1, PC2, ...).", + "value": "类别或离散值.", + "percentage": "该值的占比 = (count / total) x 100.", + "best_distribution": "根据AIC拟合最好的scipy分布.", + "aic": "赤池信息准则 -- 越低越好, 惩罚复杂度.", + "bic": "贝叶斯信息准则 -- 越低越好, 比AIC更保守.", + "ks_statistic": "衡量拟合分布最大CDF偏差的K-S统计量.", + "jarque_bera_stat": "Jarque-Bera检验统计量. 大值表示非正态.", + "jb_p_value": "Jarque-Bera检验p值. p < 0.05 -> 拒绝正态性.", + "recommended_transform": "使列更正态的推荐变换(Box-Cox或Yeo-Johnson).", + "original_skew": "原始(未变换)列的偏度.", + "transformed_skew": "应用推荐变换后的偏度.", + "bandwidth_silverman": "用于KDE估计的Silverman规则核带宽.", + "bandwidth_scott": "用于KDE估计的Scott规则核带宽.", + "partial_corr": "偏相关 -- 去除其他变量混淆效应后的Pearson相关.", + "mutual_information": "互信息(比特) -- 衡量两变量间的非线性依赖.", + "ci_lower": "相关系数95%自助法置信区间下限.", + "ci_upper": "相关系数95%自助法置信区间上限.", + "distance_corr": "Szekely距离相关 -- 捕获非线性依赖(0=独立, 1=依赖).", + "optimal_k": "基于轮廓分数分析确定的最佳聚类数.", + "best_silhouette": "所评估k值中最高平均轮廓分数(-1至1, 越高越好).", + "inertia": "簇内平方和(WCSS). 越低=越紧凑.", + "n_clusters_dbscan": "DBSCAN找到的聚类数(不含噪声).", + "noise_ratio": "DBSCAN标记为噪声的点的比例.", + "eps": "DBSCAN epsilon -- 从k距离图估计的邻域半径.", + "kl_divergence": "t-SNE嵌入的KL散度. 越低拟合越好.", + "tsne_perplexity": "t-SNE困惑度参数(平衡局部与全局结构).", + "n_factors": "通过Kaiser准则(特征值>1)保留的潜因子数.", + "factor_loading": "观测变量与潜因子间的相关.", + "noise_variance": "因子分析中每个变量的估计噪声(独特方差).", + "interaction_strength": "乘积交互项与头部特征间的Pearson相关.", + "monotonic_gap": "Pearson与Spearman相关之差 -- 差距大->非线性单调关系.", + "entropy_equal_width": "等宽分箱Shannon熵. 越低分布越集中.", + "entropy_equal_freq": "等频分箱Shannon熵. 
越低越集中.", + "cardinality": "类别列的唯一值数量.", + "encoding_rec": "基于基数分析的推荐编码策略.", + "leakage_risk": "特征可能泄露目标信息的风险等级(low/medium/high).", + "anomaly_score_if": "Isolation Forest异常分数. 越负越异常.", + "lof_score": "Local Outlier Factor负分数. 越负越异常.", + "mahalanobis_dist": "距数据中心的Mahalanobis距离. 越大越异常.", + "consensus_flag": "若3种异常方法中2种以上同意则为True.", + "levene_stat": "等方差性Levene检验统计量.", + "levene_p": "Levene检验p值. p < 0.05 -> 方差有显著差异.", + "kw_stat": "Kruskal-Wallis H统计量 -- 非参数单因素ANOVA.", + "kw_p": "Kruskal-Wallis检验p值. p < 0.05 -> 至少一组不同.", + "mw_stat": "Mann-Whitney U统计量 -- 非参数两样本秩检验.", + "mw_p": "Mann-Whitney U检验p值.", + "chi2_stat": "对均匀分布的卡方拟合优度统计量.", + "chi2_p": "卡方拟合优度检验p值.", + "grubbs_stat": "检测单个异常值的Grubbs检验统计量.", + "grubbs_p": "Grubbs检验p值.", + "adf_stat": "平稳性ADF(Augmented Dickey-Fuller)检验统计量.", + "adf_p": "ADF检验p值. p < 0.05 -> 序列平稳.", + "numeric_ratio": "数值列的比例.", + "categorical_ratio": "类别列的比例.", + "duplicate_row_ratio": "完全重复行的比例.", +} + +# ----- Japanese (ja) ------------------------------------------------- +METRIC_TIPS_I18N["ja"] = { + "type": "推論されたデータ型 (numeric, categorical, text, datetime, boolean).", + "count": "列の非null値の数.", + "missing": "欠損(null / NaN)値の数.", + "missing_%": "欠損率 = (欠損数 / 総行数) x 100.", + "unique": "列のユニーク(固有)値の数.", + "mean": "算術平均 = 値の合計 / 個数.", + "median": "データを並べた時の中央値(第50百分位).", + "std": "標準偏差 -- 平均周りの散らばりを測定. 大きいほど分散が大きい.", + "se": "平均の標準誤差 = std / sqrt(n). 標本平均の精度を示す.", + "cv": "変動係数 = std / |mean|. 無次元の相対的変動性の尺度.", + "mad": "中央絶対偏差 = median(|xi - median|). ロバストな散布度の尺度.", + "min": "列の最小値.", + "max": "列の最大値.", + "range": "範囲 = max - min. データの全体的な広がり.", + "p5": "第5百分位 -- データの5%がこの値未満.", + "q1": "第1四分位(第25百分位) -- データの25%がこの値未満.", + "q3": "第3四分位(第75百分位) -- データの75%がこの値未満.", + "p95": "第95百分位 -- データの95%がこの値未満.", + "iqr": "四分位範囲 = Q3 - Q1. 中央50%の広がり, 外れ値検出に使用.", + "skewness": "歪度: 分布の非対称性. 0=対称, >0=右に歪, <0=左に歪.", + "kurtosis": "超過尖度: 裾の厚さ. 
0=正規, >0=厚い裾, <0=薄い裾.", + "top": "列で最も頻繁な値.", + "freq": "最頻値の出現回数.", + "n": "分布検定に使用された非null観測数.", + "skew_type": "歪度の解釈: 対称(|s|<0.5), 中程度の歪み(0.5-1), 高い歪み(>1).", + "kurt_type": "尖度の解釈: 中程度(~0), 尖峰(>1,厚い裾), 扁平(<-1,薄い裾).", + "normality_test": "使用された正規性検定(n<=5000ならShapiro-Wilk, 大標本はD'Agostino-Pearson).", + "normality_p": "主正規性検定のp値. p<0.05 -> 非正規の可能性.", + "is_normal_0.05": "p値>=0.05ならTrue. alpha=0.05で正規性の帰無仮説が棄却されない.", + "shapiro_p": "Shapiro-Wilk検定p値. 小中標本向け(n<=5000).", + "dagostino_p": "D'Agostino-Pearson検定p値. 歪度+尖度を使用, n>=20向け.", + "ks_p": "Kolmogorov-Smirnov検定p値(正規分布と比較).", + "anderson_stat": "Anderson-Darling検定統計量. 高いほど反正規性の証拠が強い.", + "anderson_5pct_cv": "Anderson-Darling 5%臨界値. stat > cv -> 5%で正規性棄却.", + "missing_count": "この列の欠損(null)値の数.", + "missing_ratio": "欠損割合 = missing_count / total_rows (0~1).", + "dtype": "列のPandasデータ型.", + "lower_bound": "IQR下限 = Q1 - k x IQR. これより低い値は外れ値(デフォルトk=1.5).", + "upper_bound": "IQR上限 = Q3 + k x IQR. これより高い値は外れ値(デフォルトk=1.5).", + "outlier_count": "外れ値範囲外の値の数.", + "outlier_%": "外れ値率 = (outlier_count / 総数) x 100.", + "min_outlier": "検出された最小の外れ値.", + "max_outlier": "検出された最大の外れ値.", + "threshold": "使用されたZスコア閾値. |z| > thresholdなら外れ値.", + "max_zscore": "列で見つかった最大絶対Zスコア.", + "top_value": "最も頻繁なカテゴリ値.", + "top_frequency": "最も頻繁なカテゴリのカウント.", + "top_%": "最も頻繁なカテゴリの割合 = (top_freq / 総数) x 100.", + "entropy": "Shannonエントロピー(ビット). 高いほどカテゴリ間の分布が均一.", + "norm_entropy": "正規化エントロピー = entropy / log2(unique). 1.0=完全均一.", + "max_entropy": "最大可能エントロピー = log2(unique). 全カテゴリが同頻度で達成.", + "normalized_entropy": "norm_entropyと同じ: entropy / max_entropy. 1.0=均一.", + "unique_values": "固有カテゴリ値の数.", + "variance": "列の分散 = 平均からの偏差の二乗の平均.", + "mean_abs_corr": "他の全数値列との平均絶対Pearson相関係数.", + "avg_mutual_info": "他の全列との平均相互情報量.", + "VIF": "分散膨張係数. VIF=1->多重共線性なし, >5->中程度, >10->深刻.", + "multicollinearity": "VIF解釈: 低(<5), 中程度(5-10), 高(>=10).", + "variance_ratio": "この主成分が説明する総分散の割合.", + "cumulative_ratio": "この成分までの累積分散説明割合.", + "eigenvalue": "この成分の共分散行列の固有値. 
高いほどより多くの分散を説明.", + "n_components": "計算された主成分の総数.", + "total_variance_explained": "全成分で説明される総分散.", + "components_for_90pct": "分散90%以上の説明に必要な最小成分数.", + "top_component_variance": "第1(最重要)主成分の分散比率.", + "total_rows": "データセットの総行数.", + "duplicate_rows": "完全重複行の数.", + "unique_rows": "ユニーク(非重複)行の数.", + "duplicate_ratio": "重複行の割合 = duplicate_rows / total_rows.", + "uniqueness_ratio": "ユニーク値の割合 = unique / total_non_null. 1.0=全て固有.", + "total_non_null": "ユニーク性計算に使用された非null値の数.", + "is_unique_key": "全ての非null値がユニークならTrue -- 主キーの候補.", + "completeness": "非欠損値の割合 = 1 - (missing / total). 1.0=欠損なし.", + "uniqueness": "ユニーク値と総非null値の比率. 高いほど多様.", + "consistency": "型一致性の測定. 1.0=全値が期待データ型と一致.", + "validity": "期待範囲/形式内の値の割合. 1.0=全て有効.", + "overall": "加重品質スコア = 0.35*completeness + 0.25*uniqueness + 0.20*consistency + 0.20*validity.", + "quality_score": "完全性と一意性を組み合わせた列別品質スコア.", + "column": "データセットの列名.", + "component": "主成分識別子(PC1, PC2, ...).", + "value": "カテゴリまたは離散値.", + "percentage": "この値の割合 = (count / total) x 100.", + "best_distribution": "AIC基準でデータに最も適合するscipy分布.", + "aic": "赤池情報量基準 -- 低いほど良い. 複雑さにペナルティ.", + "bic": "ベイズ情報量基準 -- 低いほど良い. AICより保守的.", + "ks_statistic": "適合分布との最大CDF偏差を測るK-S統計量.", + "jarque_bera_stat": "Jarque-Bera検定統計量. 大きい値は非正規性を示す.", + "jb_p_value": "Jarque-Bera検定p値. p < 0.05 -> 正規性棄却.", + "recommended_transform": "列をより正規にする推奨変換(Box-CoxまたはYeo-Johnson).", + "original_skew": "元の(未変換)列の歪度.", + "transformed_skew": "推奨変換適用後の歪度.", + "bandwidth_silverman": "KDE推定用Silvermanルールのカーネル帯域幅.", + "bandwidth_scott": "KDE推定用Scottルールのカーネル帯域幅.", + "partial_corr": "偏相関 -- 他変数の交絡効果を除去後のPearson相関.", + "mutual_information": "相互情報量(ビット) -- 2変数間の非線形依存性を測定.", + "ci_lower": "相関係数の95%ブートストラップ信頼区間の下限.", + "ci_upper": "相関係数の95%ブートストラップ信頼区間の上限.", + "distance_corr": "Szekely距離相関 -- 非線形依存性を捕捉(0=独立, 1=依存).", + "optimal_k": "シルエットスコア分析で決定された最適クラスタ数.", + "best_silhouette": "評価したk値の中で最高の平均シルエットスコア(-1~1, 高いほど良い).", + "inertia": "クラスタ内二乗和(WCSS). 
低いほど密集したクラスタ.", + "n_clusters_dbscan": "DBSCANが発見したクラスタ数(ノイズ除く).", + "noise_ratio": "DBSCANがノイズとした点の割合.", + "eps": "DBSCAN epsilon -- k距離プロットから推定された近傍半径.", + "kl_divergence": "t-SNE埋め込みのKLダイバージェンス. 低いほど良い適合.", + "tsne_perplexity": "t-SNEパープレキシティ(局所と大域構造のバランス).", + "n_factors": "Kaiser基準(固有値>1)で保持された潜在因子数.", + "factor_loading": "観測変数と潜在因子間の相関.", + "noise_variance": "因子分析における各変数の推定ノイズ(固有分散).", + "interaction_strength": "積交互作用項と上位特徴間のPearson相関.", + "monotonic_gap": "PearsonとSpearman相関の差 -- 大きい差->非線形単調関係.", + "entropy_equal_width": "等幅ビンのShannonエントロピー. 低いほど集中した分布.", + "entropy_equal_freq": "等頻度ビンのShannonエントロピー. 低いほど集中.", + "cardinality": "カテゴリ列のユニーク値の数.", + "encoding_rec": "カーディナリティ分析に基づく推奨エンコーディング戦略.", + "leakage_risk": "特徴がターゲット情報を漏洩するリスクレベル(low/medium/high).", + "anomaly_score_if": "Isolation Forest異常スコア. より負=より異常.", + "lof_score": "Local Outlier Factor負スコア. より負=より異常.", + "mahalanobis_dist": "データ重心からのMahalanobis距離. 大きい=より異常.", + "consensus_flag": "3つの異常値手法のうち2つ以上が同意すればTrue.", + "levene_stat": "等分散性のLevene検定統計量.", + "levene_p": "Levene検定p値. p < 0.05 -> 分散に有意差あり.", + "kw_stat": "Kruskal-Wallis H統計量 -- ノンパラメトリック一元配置ANOVA.", + "kw_p": "Kruskal-Wallis検定p値. p < 0.05 -> 少なくとも1つのグループが異なる.", + "mw_stat": "Mann-Whitney U統計量 -- ノンパラメトリック二標本順位検定.", + "mw_p": "Mann-Whitney U検定p値.", + "chi2_stat": "均一分布に対するカイ二乗適合度統計量.", + "chi2_p": "カイ二乗適合度検定p値.", + "grubbs_stat": "単一外れ値検出のためのGrubbs検定統計量.", + "grubbs_p": "Grubbs検定p値.", + "adf_stat": "定常性のADF(Augmented Dickey-Fuller)検定統計量.", + "adf_p": "ADF検定p値. 
p < 0.05 -> 時系列は定常.", + "numeric_ratio": "数値列の割合.", + "categorical_ratio": "カテゴリ列の割合.", + "duplicate_row_ratio": "完全重複行の割合.", +} + +# ----- German (de) ---------------------------------------------------- +METRIC_TIPS_I18N["de"] = { + "type": "Abgeleiteter Datentyp der Spalte (numerisch, kategorisch, Text, Datum, boolesch).", + "count": "Anzahl nicht-leerer Werte in der Spalte.", + "missing": "Anzahl fehlender (null / NaN) Werte.", + "missing_%": "Prozent fehlender Werte = (fehlend / Gesamtzeilen) × 100.", + "unique": "Anzahl eindeutiger Werte in der Spalte.", + "mean": "Arithmetisches Mittel = Summe der Werte / Anzahl.", + "median": "Mittlerer Wert der sortierten Daten (50. Perzentil).", + "std": "Standardabweichung – misst die Streuung um den Mittelwert. Größer = stärker gestreut.", + "se": "Standardfehler des Mittelwerts = std / √n. Zeigt die Präzision des Stichprobenmittelwerts.", + "cv": "Variationskoeffizient = std / |Mittelwert|. Dimensionslose relative Variabilität.", + "mad": "Mediane Absolute Abweichung = Median(|xi − Median|). Robuste Streuungsmaß.", + "min": "Minimalwert in der Spalte.", + "max": "Maximalwert in der Spalte.", + "range": "Spannweite = Max − Min. Gesamtspanne der Daten.", + "p5": "5. Perzentil – 5% der Daten liegen unterhalb dieses Werts.", + "q1": "1. Quartil (25. Perzentil) – 25% der Daten liegen darunter.", + "q3": "3. Quartil (75. Perzentil) – 75% der Daten liegen darunter.", + "p95": "95. Perzentil – 95% der Daten liegen unterhalb dieses Werts.", + "iqr": "Interquartilsabstand = Q3 − Q1. Mittlere 50%, verwendet für Ausreißererkennung.", + "skewness": "Schiefe misst Verteilungsasymmetrie. 0=symmetrisch, >0=rechtschief, <0=linksschief.", + "kurtosis": "Exzess-Kurtosis misst Schwere der Ränder. 
0=normal, >0=schwere Ränder, <0=leichte Ränder.", + "top": "Häufigster Wert in der Spalte.", + "freq": "Häufigkeit des häufigsten Werts.", + "n": "Anzahl nicht-leerer Beobachtungen für den Verteilungstest.", + "skew_type": "Interpretation der Schiefe: symmetrisch (|s|<0,5), mäßig schief (0,5–1), stark schief (>1).", + "kurt_type": "Interpretation der Kurtosis: mesokurtisch (~0), leptokurtisch (>1, schwere Ränder), platykurtisch (<−1, leichte Ränder).", + "normality_test": "Verwendeter primärer Normalitätstest (Shapiro-Wilk für n≤5000, D'Agostino-Pearson für größere).", + "normality_p": "p-Wert des primären Normalitätstests. p<0,05 → wahrscheinlich nicht normal.", + "is_normal_0.05": "Wahr, wenn p≥0,05 – Normalitätshypothese wird bei α=0,05 nicht abgelehnt.", + "shapiro_p": "p-Wert des Shapiro-Wilk-Tests. Am besten für kleine bis mittlere Stichproben (n≤5000).", + "dagostino_p": "p-Wert des D'Agostino-Pearson-Tests. Nutzt Schiefe + Kurtosis, gut für n≥20.", + "ks_p": "p-Wert des Kolmogorow-Smirnow-Tests gegen Normalverteilung.", + "anderson_stat": "Anderson-Darling-Teststatistik. Höher = stärkerer Hinweis gegen Normalität.", + "anderson_5pct_cv": "Anderson-Darling 5%-Kritischer Wert. Stat > krit. Wert → Normalität bei 5% ablehnen.", + "missing_count": "Anzahl fehlender (null) Werte in dieser Spalte.", + "missing_ratio": "Anteil fehlender Werte = Fehlend / Gesamtzeilen (0 bis 1).", + "dtype": "Pandas-Datentyp der Spalte.", + "lower_bound": "IQR-Untergrenze = Q1 − k×IQR. Werte darunter sind Ausreißer (Standard k=1,5).", + "upper_bound": "IQR-Obergrenze = Q3 + k×IQR. Werte darüber sind Ausreißer (Standard k=1,5).", + "outlier_count": "Anzahl der Werte außerhalb der Ausreißergrenzen.", + "outlier_%": "Prozent der Ausreißer = (Ausreißeranzahl / Gesamt) × 100.", + "min_outlier": "Kleinster erkannter Ausreißerwert.", + "max_outlier": "Größter erkannter Ausreißerwert.", + "threshold": "Verwendeter Z-Score-Schwellenwert. 
Werte mit |z| > Schwelle sind Ausreißer.", + "max_zscore": "Maximaler absoluter Z-Score in der Spalte.", + "top_value": "Häufigste Kategorie.", + "top_frequency": "Anzahl der häufigsten Kategorie.", + "top_%": "Prozent der häufigsten Kategorie = (Häufigkeit / Gesamt) × 100.", + "entropy": "Shannon-Entropie (Bit). Höher = gleichmäßigere Verteilung.", + "norm_entropy": "Normalisierte Entropie = Entropie / log₂(Unique). 1,0 = perfekt gleichmäßig.", + "max_entropy": "Maximal mögliche Entropie = log₂(Unique). Erreicht bei Gleichverteilung.", + "normalized_entropy": "Wie norm_entropy: Entropie / max. Entropie. 1,0 = gleichverteilt.", + "unique_values": "Anzahl verschiedener Kategoriewerte.", + "variance": "Varianz der Spalte = mittlere quadratische Abweichung vom Mittelwert.", + "mean_abs_corr": "Mittlere absolute Pearson-Korrelation mit allen anderen numerischen Spalten.", + "avg_mutual_info": "Mittlere gegenseitige Information mit allen anderen Spalten.", + "VIF": "Varianzinflationsfaktor. VIF=1 → keine Multikollinearität, >5 → mäßig, >10 → schwer.", + "multicollinearity": "Interpretation des VIF: niedrig (<5), mäßig (5–10) oder hoch (≥10).", + "variance_ratio": "Anteil der Gesamtvarianz, der von dieser Hauptkomponente erklärt wird.", + "cumulative_ratio": "Kumulativer Varianzanteil bis einschließlich dieser Komponente.", + "eigenvalue": "Eigenwert der Kovarianzmatrix für diese Komponente. 
Höher = mehr Varianz.", + "n_components": "Gesamtanzahl berechneter Hauptkomponenten.", + "total_variance_explained": "Von allen berechneten Komponenten erfasste Gesamtvarianz.", + "components_for_90pct": "Minimale Komponentenanzahl für ≥90% erklärte Varianz.", + "top_component_variance": "Varianzanteil der ersten (wichtigsten) Hauptkomponente.", + "total_rows": "Gesamtzahl der Zeilen im Datensatz.", + "duplicate_rows": "Anzahl exakter Duplikatzeilen.", + "unique_rows": "Anzahl einzigartiger (nicht-duplizierter) Zeilen.", + "duplicate_ratio": "Anteil duplizierter Zeilen = Duplikate / Gesamtzeilen.", + "uniqueness_ratio": "Quotient einzigartiger Werte = Unique / Gesamt (nicht-null). 1,0 = alle einzigartig.", + "total_non_null": "Nicht-leere Werte für die Eindeutigkeitsberechnung.", + "is_unique_key": "Wahr, wenn jeder nicht-leere Wert einzigartig ist – potenzieller Primärschlüssel.", + "completeness": "Anteil nicht-fehlender Werte = 1 − (Fehlend / Gesamt). 1,0 = keine Fehlwerte.", + "uniqueness": "Quotient eindeutiger Werte zu nicht-leeren Werten. Höher = vielfältiger.", + "consistency": "Typ-Konsistenz. 1,0 = alle Werte entsprechen dem erwarteten Datentyp.", + "validity": "Anteil gültiger Werte (in erwarteten Bereichen/Formaten). 1,0 = alle gültig.", + "overall": "Gewichteter Qualitätsscore = 0,35×Vollst. + 0,25×Eindeut. + 0,20×Konsist. + 0,20×Gültigk.", + "quality_score": "Spaltenqualität aus Vollständigkeit und Eindeutigkeit.", + "column": "Spaltenname im Datensatz.", + "component": "Hauptkomponenten-Kennung (PC1, PC2, …).", + "value": "Kategorie- oder diskreter Wert.", + "percentage": "Prozentualer Anteil dieses Werts = (Anzahl / Gesamt) × 100.", + "best_distribution": "Scipy-Verteilung mit dem besten AIC-Fit.", + "aic": "Akaike-Informationskriterium – niedriger = besser. Bestraft Komplexität.", + "bic": "Bayessches Informationskriterium – niedriger = besser. 
Konservativer als AIC.", + "ks_statistic": "KS-Statistik: maximale CDF-Abweichung von der angepassten Verteilung.", + "jarque_bera_stat": "Jarque-Bera-Teststatistik. Große Werte → Nicht-Normalität.", + "jb_p_value": "p-Wert des Jarque-Bera-Tests. p<0,05 → Normalität ablehnen.", + "recommended_transform": "Empfohlene Potenztransformation für bessere Normalität (Box-Cox oder Yeo-Johnson).", + "original_skew": "Schiefe der Originalspalte (untransformiert).", + "transformed_skew": "Schiefe nach Anwendung der empfohlenen Transformation.", + "bandwidth_silverman": "Kernel-Bandbreite nach Silvermans Regel für KDE.", + "bandwidth_scott": "Kernel-Bandbreite nach Scotts Regel für KDE.", + "partial_corr": "Partielle Korrelation – Pearson nach Entfernung von Konfounder-Effekten.", + "mutual_information": "Gegenseitige Information (Bit) – misst nicht-lineare Abhängigkeit.", + "ci_lower": "Untere Grenze des 95%-Bootstrap-Konfidenzintervalls der Korrelation.", + "ci_upper": "Obere Grenze des 95%-Bootstrap-Konfidenzintervalls der Korrelation.", + "distance_corr": "Szekely-Distanzkorrelation – erfasst nicht-lineare Abhängigkeiten (0=unabhängig, 1=abhängig).", + "optimal_k": "Beste Clusteranzahl laut Silhouetten-Analyse.", + "best_silhouette": "Höchster mittlerer Silhouetten-Score (-1 bis 1, höher = besser).", + "inertia": "Within-Cluster Summe der Quadrate (WCSS). Niedriger = kompaktere Cluster.", + "n_clusters_dbscan": "Anzahl der von DBSCAN gefundenen Cluster (ohne Rauschen).", + "noise_ratio": "Anteil der als Rauschen eingestuften Punkte.", + "eps": "DBSCAN-Epsilon – Nachbarschaftsradius, automatisch aus k-Distanz-Plot geschätzt.", + "kl_divergence": "Kullback-Leibler-Divergenz der t-SNE-Einbettung. Niedriger = bessere Anpassung.", + "tsne_perplexity": "Perplexitäts-Parameter für t-SNE (balanciert lokal vs. 
global).", + "n_factors": "Anzahl behaltener latenter Faktoren nach Kaiser-Kriterium (Eigenwert > 1).", + "factor_loading": "Korrelation zwischen beobachteter Variable und latentem Faktor.", + "noise_variance": "Geschätzte Rauschvarianz (Uniqueness) jeder Variable in der Faktorenanalyse.", + "interaction_strength": "Pearson-Korrelation zwischen Produkt-Interaktionsterm und Top-Feature.", + "monotonic_gap": "Lücke zwischen Pearson- und Spearman-Korrelation. Groß → nicht-linearer monotoner Zusammenhang.", + "entropy_equal_width": "Shannon-Entropie des Gleich-Breite-Binnings. Niedriger = konzentriertere Verteilung.", + "entropy_equal_freq": "Shannon-Entropie des Gleich-Frequenz-Binnings. Niedriger = konzentrierter.", + "cardinality": "Anzahl eindeutiger Werte einer kategorischen Spalte.", + "encoding_rec": "Empfohlene Kodierungsstrategie basierend auf der Kardinalitätsanalyse.", + "leakage_risk": "Risikostufe (niedrig/mittel/hoch), dass ein Feature Zielinformationen leakt.", + "anomaly_score_if": "Isolation-Forest-Anomalie-Score. Negativer = anomaler.", + "lof_score": "Local Outlier Factor (minus-Score). Negativer = anomaler.", + "mahalanobis_dist": "Mahalanobis-Distanz zum Datenzentrum. Größer = ungewöhnlicher.", + "consensus_flag": "Wahr, wenn ≥2 von 3 Anomalie-Methoden den Punkt als anomal einstufen.", + "levene_stat": "Levene-Teststatistik für Varianzgleichheit.", + "levene_p": "p-Wert des Levene-Tests. p<0,05 → Varianzen signifikant verschieden.", + "kw_stat": "Kruskal-Wallis-H-Statistik – nicht-parametrische Einweg-ANOVA.", + "kw_p": "p-Wert des Kruskal-Wallis-Tests. p<0,05 → mindestens eine Gruppe unterscheidet sich.", + "mw_stat": "Mann-Whitney-U-Statistik – nicht-parametrischer Zweistichproben-Rangtest.", + "mw_p": "p-Wert des Mann-Whitney-U-Tests.", + "chi2_stat": "Chi-Quadrat-Anpassungsstatistik vs. 
Gleichverteilung.", + "chi2_p": "p-Wert des Chi-Quadrat-Anpassungstests.", + "grubbs_stat": "Grubbs-Teststatistik zur Erkennung eines einzelnen Ausreißers.", + "grubbs_p": "p-Wert des Grubbs-Tests.", + "adf_stat": "ADF-Teststatistik (Augmented Dickey-Fuller) für Stationarität.", + "adf_p": "p-Wert des ADF-Tests. p<0,05 → Reihe ist stationär.", + "numeric_ratio": "Anteil numerischer Spalten.", + "categorical_ratio": "Anteil kategorischer Spalten.", + "duplicate_row_ratio": "Anteil exakter Duplikatzeilen.", +} + +# ----- French (fr) ---------------------------------------------------- +METRIC_TIPS_I18N["fr"] = { + "type": "Type de données inféré de la colonne (numérique, catégoriel, texte, date, booléen).", + "count": "Nombre de valeurs non nulles dans la colonne.", + "missing": "Nombre de valeurs manquantes (null / NaN).", + "missing_%": "Pourcentage de valeurs manquantes = (manquants / total lignes) × 100.", + "unique": "Nombre de valeurs distinctes dans la colonne.", + "mean": "Moyenne arithmétique = somme des valeurs / nombre.", + "median": "Valeur centrale des données triées (50e percentile).", + "std": "Écart-type – mesure la dispersion autour de la moyenne. Plus grand = plus dispersé.", + "se": "Erreur standard de la moyenne = std / √n. Indique la précision de la moyenne.", + "cv": "Coefficient de variation = std / |moyenne|. Mesure relative sans unité de la variabilité.", + "mad": "Déviation Absolue Médiane = médiane(|xi − médiane|). Mesure robuste de dispersion.", + "min": "Valeur minimale dans la colonne.", + "max": "Valeur maximale dans la colonne.", + "range": "Étendue = max − min. Dispersion totale des données.", + "p5": "5e percentile – 5% des données sont inférieures à cette valeur.", + "q1": "1er quartile (25e percentile) – 25% des données sont inférieures.", + "q3": "3e quartile (75e percentile) – 75% des données sont inférieures.", + "p95": "95e percentile – 95% des données sont inférieures à cette valeur.", + "iqr": "Écart interquartile = Q3 − Q1. 
50% central, utilisé pour la détection d'aberrants.", + "skewness": "L'asymétrie mesure la déformation de la distribution. 0=symétrique, >0=droite, <0=gauche.", + "kurtosis": "Kurtosis excédentaire – poids des queues. 0=normal, >0=queues lourdes, <0=queues légères.", + "top": "Valeur la plus fréquente dans la colonne.", + "freq": "Fréquence de la valeur la plus courante.", + "n": "Nombre d'observations non nulles utilisées pour le test de distribution.", + "skew_type": "Interprétation de l'asymétrie : symétrique (|s|<0,5), modérée (0,5–1), forte (>1).", + "kurt_type": "Interprétation de la kurtosis : mésokurtique (~0), leptokurtique (>1), platykurtique (<−1).", + "normality_test": "Test de normalité principal (Shapiro-Wilk pour n≤5000, D'Agostino-Pearson pour plus grand).", + "normality_p": "p-value du test de normalité. p<0,05 → probablement non normal.", + "is_normal_0.05": "Vrai si p≥0,05 – la normalité n'est pas rejetée à α=0,05.", + "shapiro_p": "p-value du test de Shapiro-Wilk. Optimal pour échantillons petits à moyens (n≤5000).", + "dagostino_p": "p-value du test de D'Agostino-Pearson. Utilise asymétrie + kurtosis, adapté pour n≥20.", + "ks_p": "p-value du test de Kolmogorov-Smirnov vs distribution normale.", + "anderson_stat": "Statistique du test d'Anderson-Darling. Plus élevée = preuve plus forte contre la normalité.", + "anderson_5pct_cv": "Valeur critique à 5% d'Anderson-Darling. Stat > VC → rejeter la normalité à 5%.", + "missing_count": "Nombre de valeurs manquantes (null) dans cette colonne.", + "missing_ratio": "Fraction de valeurs manquantes = manquants / total (0 à 1).", + "dtype": "Type pandas de la colonne.", + "lower_bound": "Borne inférieure IQR = Q1 − k×IQR. Valeurs en dessous = aberrants (k=1,5 par défaut).", + "upper_bound": "Borne supérieure IQR = Q3 + k×IQR. 
Valeurs au-dessus = aberrants (k=1,5 par défaut).", + "outlier_count": "Nombre de valeurs en dehors des bornes.", + "outlier_%": "Pourcentage d'aberrants = (nombre / total) × 100.", + "min_outlier": "Plus petit aberrant détecté.", + "max_outlier": "Plus grand aberrant détecté.", + "threshold": "Seuil de Z-score utilisé. |z| > seuil = aberrant.", + "max_zscore": "Z-score absolu maximal trouvé dans la colonne.", + "top_value": "Catégorie la plus fréquente.", + "top_frequency": "Nombre d'occurrences de la catégorie la plus fréquente.", + "top_%": "Pourcentage de la catégorie la plus fréquente = (fréq / total) × 100.", + "entropy": "Entropie de Shannon (bits). Plus élevée = distribution plus uniforme.", + "norm_entropy": "Entropie normalisée = entropie / log₂(unique). 1,0 = parfaitement uniforme.", + "max_entropy": "Entropie maximale possible = log₂(unique). Atteinte quand toutes les catégories sont équiprobables.", + "normalized_entropy": "Idem norm_entropy : entropie / entropie max. 1,0 = uniforme.", + "unique_values": "Nombre de valeurs catégorielles distinctes.", + "variance": "Variance de la colonne = moyenne des écarts quadratiques à la moyenne.", + "mean_abs_corr": "Corrélation Pearson absolue moyenne avec toutes les autres colonnes numériques.", + "avg_mutual_info": "Information mutuelle moyenne avec toutes les autres colonnes.", + "VIF": "Facteur d'inflation de la variance. VIF=1 → pas de multicolinéarité, >5 → modérée, >10 → sévère.", + "multicollinearity": "Interprétation du VIF : faible (<5), modérée (5–10) ou élevée (≥10).", + "variance_ratio": "Part de la variance totale expliquée par cette composante principale.", + "cumulative_ratio": "Part cumulative de la variance expliquée jusqu'à cette composante.", + "eigenvalue": "Valeur propre de la matrice de covariance. 
Plus élevée = plus de variance.", + "n_components": "Nombre total de composantes principales calculées.", + "total_variance_explained": "Variance totale captée par toutes les composantes calculées.", + "components_for_90pct": "Nombre minimum de composantes pour ≥90% de variance expliquée.", + "top_component_variance": "Part de variance de la première (plus importante) composante.", + "total_rows": "Nombre total de lignes dans le jeu de données.", + "duplicate_rows": "Nombre de lignes exactement dupliquées.", + "unique_rows": "Nombre de lignes uniques (non dupliquées).", + "duplicate_ratio": "Part de lignes dupliquées = doublons / total.", + "uniqueness_ratio": "Ratio de valeurs uniques = unique / total non-null. 1,0 = toutes uniques.", + "total_non_null": "Valeurs non nulles utilisées pour le calcul d'unicité.", + "is_unique_key": "Vrai si chaque valeur non nulle est unique – clé primaire potentielle.", + "completeness": "Fraction de valeurs non manquantes = 1 − (manquants / total). 1,0 = aucune donnée manquante.", + "uniqueness": "Ratio de valeurs uniques / non-null. Plus élevé = plus diversifié.", + "consistency": "Cohérence des types. 1,0 = toutes les valeurs correspondent au type attendu.", + "validity": "Fraction de valeurs dans les plages/formats attendus. 1,0 = toutes valides.", + "overall": "Score qualité pondéré = 0,35×Complét. + 0,25×Unicit. + 0,20×Cohér. + 0,20×Valid.", + "quality_score": "Score qualité par colonne combinant complétude et unicité.", + "column": "Nom de la colonne dans le jeu de données.", + "component": "Identifiant de la composante principale (PC1, PC2, …).", + "value": "Valeur catégorielle ou discrète.", + "percentage": "Part en pourcentage = (nombre / total) × 100.", + "best_distribution": "Distribution scipy avec le meilleur ajustement AIC.", + "aic": "Critère d'information d'Akaike – plus bas = meilleur. Pénalise la complexité.", + "bic": "Critère d'information bayésien – plus bas = meilleur. 
Plus conservateur que l'AIC.", + "ks_statistic": "Statistique KS : écart CDF maximal par rapport à la distribution ajustée.", + "jarque_bera_stat": "Statistique du test de Jarque-Bera. Grande valeur → non-normalité.", + "jb_p_value": "p-value du test de Jarque-Bera. p<0,05 → rejeter la normalité.", + "recommended_transform": "Transformation de puissance recommandée (Box-Cox ou Yeo-Johnson).", + "original_skew": "Asymétrie de la colonne originale (non transformée).", + "transformed_skew": "Asymétrie après application de la transformation recommandée.", + "bandwidth_silverman": "Largeur de bande du noyau selon la règle de Silverman pour KDE.", + "bandwidth_scott": "Largeur de bande du noyau selon la règle de Scott pour KDE.", + "partial_corr": "Corrélation partielle – Pearson après suppression des effets confondants.", + "mutual_information": "Information mutuelle (bits) – mesure la dépendance non linéaire.", + "ci_lower": "Borne inférieure de l'IC bootstrap à 95% de la corrélation.", + "ci_upper": "Borne supérieure de l'IC bootstrap à 95% de la corrélation.", + "distance_corr": "Corrélation de distance de Szekely – capte les dépendances non linéaires (0=indépendant, 1=dépendant).", + "optimal_k": "Meilleur nombre de clusters selon l'analyse silhouette.", + "best_silhouette": "Score silhouette moyen le plus élevé (-1 à 1, plus haut = meilleur).", + "inertia": "Somme des carrés intra-cluster (WCSS). Plus bas = clusters plus compacts.", + "n_clusters_dbscan": "Nombre de clusters trouvés par DBSCAN (hors bruit).", + "noise_ratio": "Part des points étiquetés comme bruit par DBSCAN.", + "eps": "Epsilon DBSCAN – rayon de voisinage auto-estimé depuis le graphe k-distance.", + "kl_divergence": "Divergence de Kullback-Leibler de l'embedding t-SNE. 
Plus bas = meilleur.", + "tsne_perplexity": "Paramètre de perplexité pour t-SNE (équilibre local vs global).", + "n_factors": "Nombre de facteurs latents retenus par le critère de Kaiser (valeur propre > 1).", + "factor_loading": "Corrélation entre une variable observée et un facteur latent.", + "noise_variance": "Variance de bruit estimée (unicité) de chaque variable en analyse factorielle.", + "interaction_strength": "Corrélation Pearson entre un terme d'interaction (produit) et le meilleur feature.", + "monotonic_gap": "Écart entre corrélations Pearson et Spearman. Grand → relation monotone non linéaire.", + "entropy_equal_width": "Entropie de Shannon du binning à largeur égale. Plus bas = plus concentré.", + "entropy_equal_freq": "Entropie de Shannon du binning à fréquence égale. Plus bas = plus concentré.", + "cardinality": "Nombre de valeurs uniques d'une colonne catégorielle.", + "encoding_rec": "Stratégie d'encodage recommandée selon l'analyse de cardinalité.", + "leakage_risk": "Niveau de risque (faible/moyen/élevé) de fuite d'information cible.", + "anomaly_score_if": "Score d'anomalie Isolation Forest. Plus négatif = plus anomal.", + "lof_score": "Score LOF (minus). Plus négatif = plus anomal.", + "mahalanobis_dist": "Distance de Mahalanobis au centroïde. Plus grande = plus inhabituel.", + "consensus_flag": "Vrai si ≥2 des 3 méthodes considèrent le point comme anomal.", + "levene_stat": "Statistique du test de Levene pour l'égalité des variances.", + "levene_p": "p-value du test de Levene. p<0,05 → variances significativement différentes.", + "kw_stat": "Statistique H de Kruskal-Wallis – ANOVA non paramétrique à un facteur.", + "kw_p": "p-value du test de Kruskal-Wallis. 
p<0,05 → au moins un groupe diffère.", + "mw_stat": "Statistique U de Mann-Whitney – test de rang non paramétrique à deux échantillons.", + "mw_p": "p-value du test de Mann-Whitney U.", + "chi2_stat": "Statistique du test d'ajustement chi-deux vs distribution uniforme.", + "chi2_p": "p-value du test d'ajustement chi-deux.", + "grubbs_stat": "Statistique du test de Grubbs pour la détection d'un aberrant unique.", + "grubbs_p": "p-value du test de Grubbs.", + "adf_stat": "Statistique du test ADF (Augmented Dickey-Fuller) de stationnarité.", + "adf_p": "p-value du test ADF. p<0,05 → la série est stationnaire.", + "numeric_ratio": "Part des colonnes numériques.", + "categorical_ratio": "Part des colonnes catégorielles.", + "duplicate_row_ratio": "Part des lignes exactement dupliquées.", +} + + +def get_metric_tips_json() -> str: + """Return METRIC_TIPS_I18N dict as a JSON string for embedding in JS.""" + import json + return json.dumps(METRIC_TIPS_I18N, ensure_ascii=False) diff --git a/f2a/stats/__init__.py b/f2a/stats/__init__.py new file mode 100644 index 0000000..341c810 --- /dev/null +++ b/f2a/stats/__init__.py @@ -0,0 +1,53 @@ +"""Stats module — statistical analysis engine.""" + +from f2a.stats.categorical import CategoricalStats +from f2a.stats.correlation import CorrelationStats +from f2a.stats.descriptive import DescriptiveStats +from f2a.stats.distribution import DistributionStats +from f2a.stats.duplicates import DuplicateStats +from f2a.stats.feature_importance import FeatureImportanceStats +from f2a.stats.missing import MissingStats +from f2a.stats.outlier import OutlierStats +from f2a.stats.pca_analysis import PCAStats +from f2a.stats.quality import QualityStats + +# Advanced stats modules +from f2a.stats.advanced_anomaly import AdvancedAnomalyStats +from f2a.stats.advanced_correlation import AdvancedCorrelationStats +from f2a.stats.advanced_dimreduction import AdvancedDimReductionStats +from f2a.stats.advanced_distribution import AdvancedDistributionStats 
+from f2a.stats.clustering import ClusteringStats +from f2a.stats.feature_insights import FeatureInsightsStats +from f2a.stats.statistical_tests import StatisticalTests + +# Enhancement modules (v2) +from f2a.stats.column_role import ColumnRoleClassifier +from f2a.stats.cross_analysis import CrossAnalysis +from f2a.stats.insight_engine import InsightEngine +from f2a.stats.ml_readiness import MLReadinessEvaluator + +__all__ = [ + "CategoricalStats", + "CorrelationStats", + "DescriptiveStats", + "DistributionStats", + "DuplicateStats", + "FeatureImportanceStats", + "MissingStats", + "OutlierStats", + "PCAStats", + "QualityStats", + # Advanced + "AdvancedAnomalyStats", + "AdvancedCorrelationStats", + "AdvancedDimReductionStats", + "AdvancedDistributionStats", + "ClusteringStats", + "FeatureInsightsStats", + "StatisticalTests", + # Enhancement + "ColumnRoleClassifier", + "CrossAnalysis", + "InsightEngine", + "MLReadinessEvaluator", +] diff --git a/f2a/stats/advanced_anomaly.py b/f2a/stats/advanced_anomaly.py new file mode 100644 index 0000000..e7baf14 --- /dev/null +++ b/f2a/stats/advanced_anomaly.py @@ -0,0 +1,379 @@ +"""Advanced anomaly detection module. + +Provides Isolation Forest, Local Outlier Factor, Mahalanobis distance, +and consensus anomaly scoring. + +References: + - Liu et al. (2008) — Isolation Forest + - Breunig et al. (2000) — Local Outlier Factor + - Mahalanobis (1936) — Mahalanobis distance +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class AdvancedAnomalyStats: + """Multi-method anomaly detection for numeric columns. + + Args: + df: Target DataFrame. + schema: Data schema. + max_sample: Max rows to sample for expensive operations. + contamination: Expected proportion of anomalies. 
+ """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + max_sample: int = 5000, + contamination: float = 0.05, + ) -> None: + self._df = df + self._schema = schema + self._max_sample = max_sample + self._contamination = contamination + + def _prepare_data(self) -> tuple[np.ndarray, pd.DataFrame, list[str]] | None: + """Prepare and scale numeric data.""" + cols = self._schema.numeric_columns + if len(cols) < 2: + return None + + try: + from sklearn.preprocessing import StandardScaler + except ImportError: + return None + + df_clean = self._df[cols].dropna() + if len(df_clean) < 20: + return None + + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + scaler = StandardScaler() + X = scaler.fit_transform(df_clean) + return X, df_clean, cols + + # ── Isolation Forest ────────────────────────────────── + + def isolation_forest(self) -> dict[str, Any]: + """Detect anomalies using Isolation Forest. + + Isolation Forest isolates observations by randomly selecting a + feature and then randomly selecting a split value. Anomalies + require fewer splits (shorter path length). + + Returns: + Dictionary with anomaly_count, anomaly_ratio, scores_summary. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, df_clean, cols = prepared + + try: + from sklearn.ensemble import IsolationForest + except ImportError: + return {} + + try: + iso = IsolationForest( + contamination=self._contamination, + random_state=42, + max_samples=min(256, len(X)), + n_estimators=100, + ) + labels = iso.fit_predict(X) # -1 = anomaly, 1 = normal + scores = iso.decision_function(X) + + n_anomaly = int((labels == -1).sum()) + + return { + "method": "Isolation Forest", + "anomaly_count": n_anomaly, + "anomaly_ratio": round(n_anomaly / len(X), 4), + "n_samples": len(X), + "score_mean": round(float(scores.mean()), 4), + "score_std": round(float(scores.std()), 4), + "score_min": round(float(scores.min()), 4), + "score_threshold": round(float(np.percentile(scores, self._contamination * 100)), 4), + "labels": labels, + "scores": scores, + } + except Exception as exc: + logger.debug("Isolation Forest failed: %s", exc) + return {} + + # ── Local Outlier Factor ────────────────────────────── + + def local_outlier_factor(self) -> dict[str, Any]: + """Detect anomalies using Local Outlier Factor (LOF). + + LOF measures the local deviation of density for each sample + compared to its neighbors. + + Returns: + Dictionary with anomaly_count, anomaly_ratio, scores summary. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, df_clean, cols = prepared + + try: + from sklearn.neighbors import LocalOutlierFactor + except ImportError: + return {} + + try: + n_neighbors = min(20, len(X) - 1) + lof = LocalOutlierFactor( + n_neighbors=n_neighbors, + contamination=self._contamination, + ) + labels = lof.fit_predict(X) # -1 = anomaly + scores = lof.negative_outlier_factor_ + + n_anomaly = int((labels == -1).sum()) + + return { + "method": "Local Outlier Factor", + "anomaly_count": n_anomaly, + "anomaly_ratio": round(n_anomaly / len(X), 4), + "n_samples": len(X), + "n_neighbors": n_neighbors, + "lof_mean": round(float(scores.mean()), 4), + "lof_std": round(float(scores.std()), 4), + "lof_min": round(float(scores.min()), 4), + "labels": labels, + "scores": scores, + } + except Exception as exc: + logger.debug("LOF failed: %s", exc) + return {} + + # ── Mahalanobis distance ────────────────────────────── + + def mahalanobis_distance(self) -> dict[str, Any]: + """Detect anomalies using Mahalanobis distance. + + Points with high Mahalanobis distance from the centroid + are potential multivariate outliers. + + Returns: + Dictionary with threshold, anomaly count, distances summary. 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return {} + + df_clean = self._df[cols].dropna() + if len(df_clean) < len(cols) + 5: + return {} + + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + cols = cols[:30] # limit columns to avoid ill-conditioned matrices + df_clean = df_clean[cols] + data = df_clean.values + try: + mean = np.mean(data, axis=0) + cov = np.cov(data.T) + # Regularise to handle singular/near-singular covariance + cov += np.eye(cov.shape[0]) * 1e-6 + if np.linalg.cond(cov) > 1e10: + logger.debug("Covariance matrix ill-conditioned; skipping Mahalanobis.") + return {} + cov_inv = np.linalg.inv(cov) + + diff = data - mean + left = diff @ cov_inv + maha_sq = np.sum(left * diff, axis=1) + maha = np.sqrt(np.maximum(maha_sq, 0)) + + # Chi-squared threshold at 97.5% with p degrees of freedom + from scipy.stats import chi2 + + p = len(cols) + threshold = float(np.sqrt(chi2.ppf(0.975, p))) + + anomaly_mask = maha > threshold + n_anomaly = int(anomaly_mask.sum()) + + return { + "method": "Mahalanobis Distance", + "anomaly_count": n_anomaly, + "anomaly_ratio": round(n_anomaly / len(data), 4), + "threshold": round(threshold, 4), + "n_features": p, + "n_samples": len(data), + "distance_mean": round(float(maha.mean()), 4), + "distance_std": round(float(maha.std()), 4), + "distance_max": round(float(maha.max()), 4), + "distances": maha, + "labels": np.where(anomaly_mask, -1, 1), + } + except (np.linalg.LinAlgError, Exception) as exc: + logger.debug("Mahalanobis distance failed: %s", exc) + return {} + + # ── Consensus anomaly ───────────────────────────────── + + def consensus_anomaly(self) -> dict[str, Any]: + """Consensus anomaly detection combining multiple methods. + + An observation is flagged as anomalous if flagged by at least + 2 out of 3 methods (IF, LOF, Mahalanobis). + + Returns: + Dictionary with per-method counts, consensus count, and + agreement statistics. 
+ """ + iso_result = self.isolation_forest() + lof_result = self.local_outlier_factor() + maha_result = self.mahalanobis_distance() + + methods = [] + if "labels" in iso_result: + methods.append(("isolation_forest", iso_result["labels"])) + if "labels" in lof_result: + methods.append(("local_outlier_factor", lof_result["labels"])) + if "labels" in maha_result: + methods.append(("mahalanobis", maha_result["labels"])) + + if len(methods) < 2: + return {} + + # Align lengths (should be same, but just in case) + min_len = min(len(labels) for _, labels in methods) + vote_matrix = np.zeros((min_len, len(methods))) + + for i, (_, labels) in enumerate(methods): + vote_matrix[:, i] = (labels[:min_len] == -1).astype(int) + + votes = vote_matrix.sum(axis=1) + # Consensus: flagged by >= 2 methods + consensus_mask = votes >= 2 + + per_method = {} + for name, labels in methods: + per_method[name] = int((labels[:min_len] == -1).sum()) + + return { + "methods_used": [name for name, _ in methods], + "per_method_counts": per_method, + "consensus_count": int(consensus_mask.sum()), + "consensus_ratio": round(float(consensus_mask.sum()) / min_len, 4), + "n_samples": min_len, + "consensus_threshold": 2, + "agreement_matrix": { + "all_agree_anomaly": int((votes == len(methods)).sum()), + "majority_anomaly": int(consensus_mask.sum()), + "any_anomaly": int((votes >= 1).sum()), + "no_anomaly": int((votes == 0).sum()), + }, + } + + # ── Summary ─────────────────────────────────────────── + + def summary_full(self) -> tuple[dict[str, Any], dict[str, Any]]: + """Return combined advanced anomaly results (stripped + full). + + Returns a tuple of (stripped_summary, full_results) so that + each method is only called once. 
+ """ + result: dict[str, Any] = {} + full: dict[str, Any] = {} + + try: + iso = self.isolation_forest() + if iso: + result["isolation_forest"] = { + k: v for k, v in iso.items() if k not in ("labels", "scores") + } + full["isolation_forest"] = iso + except Exception as exc: + logger.debug("Isolation Forest skipped: %s", exc) + + try: + lof = self.local_outlier_factor() + if lof: + result["local_outlier_factor"] = { + k: v for k, v in lof.items() if k not in ("labels", "scores") + } + full["local_outlier_factor"] = lof + except Exception as exc: + logger.debug("LOF skipped: %s", exc) + + try: + maha = self.mahalanobis_distance() + if maha: + result["mahalanobis"] = { + k: v for k, v in maha.items() if k not in ("distances", "labels") + } + full["mahalanobis"] = maha + except Exception as exc: + logger.debug("Mahalanobis skipped: %s", exc) + + # Build consensus from already-computed results instead of re-running + try: + methods = [] + if "isolation_forest" in full and "labels" in full["isolation_forest"]: + methods.append(("isolation_forest", full["isolation_forest"]["labels"])) + if "local_outlier_factor" in full and "labels" in full["local_outlier_factor"]: + methods.append(("local_outlier_factor", full["local_outlier_factor"]["labels"])) + if "mahalanobis" in full and "labels" in full["mahalanobis"]: + methods.append(("mahalanobis", full["mahalanobis"]["labels"])) + + if len(methods) >= 2: + min_len = min(len(labels) for _, labels in methods) + vote_matrix = np.zeros((min_len, len(methods))) + for i, (_, labels) in enumerate(methods): + vote_matrix[:, i] = (labels[:min_len] == -1).astype(int) + votes = vote_matrix.sum(axis=1) + consensus_mask = votes >= 2 + + per_method = {} + for name, labels in methods: + per_method[name] = int((labels[:min_len] == -1).sum()) + + cons = { + "methods_used": [name for name, _ in methods], + "per_method_counts": per_method, + "consensus_count": int(consensus_mask.sum()), + "consensus_ratio": round(float(consensus_mask.sum()) / 
min_len, 4), + "n_samples": min_len, + "consensus_threshold": 2, + "agreement_matrix": { + "all_agree_anomaly": int((votes == len(methods)).sum()), + "majority_anomaly": int(consensus_mask.sum()), + "any_anomaly": int((votes >= 1).sum()), + "no_anomaly": int((votes == 0).sum()), + }, + } + result["consensus"] = cons + full["consensus"] = cons + except Exception as exc: + logger.debug("Consensus anomaly skipped: %s", exc) + + return result, full + + def summary(self) -> dict[str, Any]: + """Return combined advanced anomaly detection results (stripped).""" + result, _ = self.summary_full() + return result diff --git a/f2a/stats/advanced_correlation.py b/f2a/stats/advanced_correlation.py new file mode 100644 index 0000000..e43ce4e --- /dev/null +++ b/f2a/stats/advanced_correlation.py @@ -0,0 +1,369 @@ +"""Advanced correlation analysis module. + +Provides partial correlation matrix, mutual information matrix, +bootstrap correlation confidence intervals, and correlation network data. + +References: + - Székely et al. (2007) — distance correlation + - Reshef et al. (2011) — mutual information concepts +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class AdvancedCorrelationStats: + """Advanced correlation analysis for numeric columns. + + Args: + df: Target DataFrame. + schema: Data schema. + bootstrap_iterations: Number of bootstrap resamples for CI. + max_sample: Max rows to sample for expensive operations. 
+ """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + bootstrap_iterations: int = 1000, + max_sample: int = 5000, + ) -> None: + self._df = df + self._schema = schema + self._bootstrap_n = bootstrap_iterations + self._max_sample = max_sample + + # ── Partial correlation ─────────────────────────────── + + def partial_correlation_matrix(self) -> pd.DataFrame: + """Compute the partial correlation matrix. + + Partial correlation measures the linear relationship between two + variables after removing the effect of all other variables. + Computed via the inverse of the correlation matrix. + + Returns: + Square DataFrame of partial correlations. + """ + cols = self._schema.numeric_columns + if len(cols) < 3: + return pd.DataFrame() + + cols = cols[:30] # limit + df_clean = self._df[cols].dropna() + if len(df_clean) < len(cols) + 2: + return pd.DataFrame() + + corr = df_clean.corr() + if corr.isna().any().any(): + logger.warning("NaN in correlation matrix (zero-variance columns); skipping partial correlation.") + return pd.DataFrame() + try: + precision = np.linalg.inv(corr.values) + except np.linalg.LinAlgError: + logger.warning("Singular correlation matrix; partial correlation unavailable.") + return pd.DataFrame() + + # Partial corr: -P_ij / sqrt(P_ii * P_jj) + d = np.sqrt(np.abs(np.diag(precision))) # abs to handle numerical noise + d[d == 0] = 1e-15 # avoid division by zero + partial = -precision / np.outer(d, d) + np.fill_diagonal(partial, 1.0) + + return pd.DataFrame( + np.round(partial, 4), + index=cols, + columns=cols, + ) + + # ── Mutual information matrix ───────────────────────── + + def mutual_information_matrix(self) -> pd.DataFrame: + """Compute pairwise mutual information between numeric columns. + + Uses sklearn's ``mutual_info_regression`` to estimate MI for + each pair of columns. + + Returns: + Square DataFrame of MI values. 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + + try: + from sklearn.feature_selection import mutual_info_regression + except ImportError: + logger.info("scikit-learn not available for MI computation.") + return pd.DataFrame() + + cols = cols[:30] # limit + df_clean = self._df[cols].dropna() + if len(df_clean) < 30: + return pd.DataFrame() + + # Sample for speed + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + n = len(cols) + mi_matrix = np.zeros((n, n)) + + for i, col in enumerate(cols): + X = df_clean.drop(columns=[col]).values + y = df_clean[col].values + try: + mi = mutual_info_regression(X, y, random_state=42, n_neighbors=5) + other_cols = [c for c in cols if c != col] + for j, other in enumerate(other_cols): + idx = cols.index(other) + mi_matrix[i, idx] = float(mi[j]) + except Exception: + continue + + # Symmetrize + mi_matrix = (mi_matrix + mi_matrix.T) / 2 + np.fill_diagonal(mi_matrix, 0.0) + + return pd.DataFrame( + np.round(mi_matrix, 4), + index=cols, + columns=cols, + ) + + # ── Bootstrap correlation CI ────────────────────────── + + def bootstrap_correlation_ci( + self, + alpha: float = 0.05, + ) -> pd.DataFrame: + """Compute bootstrap confidence intervals for Pearson correlations. + + For each column pair, resamples ``bootstrap_iterations`` times + and reports the ``alpha/2`` and ``1 - alpha/2`` percentile bounds. + + Args: + alpha: Significance level (default 0.05 → 95% CI). + + Returns: + DataFrame with col_a, col_b, r, ci_lower, ci_upper, ci_width. 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + + cols = cols[:15] # limit pairs + df_clean = self._df[cols].dropna() + n = len(df_clean) + if n < 20: + return pd.DataFrame() + + # Sample for speed + if n > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + n = len(df_clean) + + rng = np.random.default_rng(42) + rows: list[dict] = [] + + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + x = df_clean[cols[i]].values + y = df_clean[cols[j]].values + + # Point estimate + r_point = float(np.corrcoef(x, y)[0, 1]) + + # Bootstrap + boot_corrs = np.empty(self._bootstrap_n) + for b in range(self._bootstrap_n): + idx = rng.integers(0, n, size=n) + bx, by = x[idx], y[idx] + std_x, std_y = bx.std(), by.std() + if std_x == 0 or std_y == 0: + boot_corrs[b] = 0.0 + else: + boot_corrs[b] = float(np.corrcoef(bx, by)[0, 1]) + + lower = float(np.percentile(boot_corrs, 100 * alpha / 2)) + upper = float(np.percentile(boot_corrs, 100 * (1 - alpha / 2))) + + rows.append({ + "col_a": cols[i], + "col_b": cols[j], + "pearson_r": round(r_point, 4), + "ci_lower": round(lower, 4), + "ci_upper": round(upper, 4), + "ci_width": round(upper - lower, 4), + "significant": not (lower <= 0 <= upper), + }) + + return pd.DataFrame(rows) if rows else pd.DataFrame() + + # ── Correlation network data ────────────────────────── + + def correlation_network(self, threshold: float = 0.5) -> dict[str, Any]: + """Build correlation network data for visualization. + + Nodes are columns; edges exist where |r| >= threshold. + + Args: + threshold: Minimum absolute correlation for an edge. + + Returns: + Dictionary with ``nodes`` (list of names) and ``edges`` + (list of {source, target, weight} dicts). 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return {"nodes": [], "edges": []} + + cols = cols[:30] + corr = self._df[cols].dropna().corr() + + edges: list[dict[str, Any]] = [] + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + r = float(corr.iloc[i, j]) + if abs(r) >= threshold: + edges.append({ + "source": cols[i], + "target": cols[j], + "weight": round(r, 4), + "abs_weight": round(abs(r), 4), + }) + + # Only include nodes that have at least one edge + connected = set() + for e in edges: + connected.add(e["source"]) + connected.add(e["target"]) + + return { + "nodes": sorted(connected), + "edges": edges, + "threshold": threshold, + "n_edges": len(edges), + } + + # ── Distance correlation ────────────────────────────── + + def distance_correlation_matrix(self) -> pd.DataFrame: + """Compute pairwise distance correlations (Székely et al., 2007). + + Distance correlation can detect non-linear dependencies + that Pearson correlation misses. + + Returns: + Square DataFrame of distance correlations. 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + + cols = cols[:15] # expensive O(n^2) per pair + df_clean = self._df[cols].dropna() + if len(df_clean) < 10: + return pd.DataFrame() + + # Sample for speed + if len(df_clean) > min(self._max_sample, 2000): + df_clean = df_clean.sample(min(self._max_sample, 2000), random_state=42) + + n = len(cols) + matrix = np.eye(n) + + for i in range(n): + for j in range(i + 1, n): + dc = self._dcor(df_clean[cols[i]].values, df_clean[cols[j]].values) + matrix[i, j] = dc + matrix[j, i] = dc + + return pd.DataFrame( + np.round(matrix, 4), + index=cols, + columns=cols, + ) + + @staticmethod + def _dcor(x: np.ndarray, y: np.ndarray) -> float: + """Compute distance correlation between two 1-D arrays.""" + n = len(x) + if n < 4: + return 0.0 + + a = np.abs(x[:, None] - x[None, :]) + b = np.abs(y[:, None] - y[None, :]) + + # Double centering + a_row = a.mean(axis=1, keepdims=True) + a_col = a.mean(axis=0, keepdims=True) + a_grand = a.mean() + A = a - a_row - a_col + a_grand + + b_row = b.mean(axis=1, keepdims=True) + b_col = b.mean(axis=0, keepdims=True) + b_grand = b.mean() + B = b - b_row - b_col + b_grand + + dcov2 = (A * B).mean() + dvar_x = (A * A).mean() + dvar_y = (B * B).mean() + + if dvar_x <= 0 or dvar_y <= 0: + return 0.0 + + return float(np.sqrt(max(dcov2, 0) / np.sqrt(dvar_x * dvar_y))) + + # ── Summary ─────────────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return combined advanced correlation results.""" + result: dict[str, Any] = {} + + try: + pcm = self.partial_correlation_matrix() + if not pcm.empty: + result["partial_correlation"] = pcm + except Exception as exc: + logger.debug("Partial correlation skipped: %s", exc) + + try: + mi = self.mutual_information_matrix() + if not mi.empty: + result["mutual_information"] = mi + except Exception as exc: + logger.debug("MI matrix skipped: %s", exc) + + try: + bci = self.bootstrap_correlation_ci() + if 
not bci.empty: + result["bootstrap_ci"] = bci + except Exception as exc: + logger.debug("Bootstrap CI skipped: %s", exc) + + try: + net = self.correlation_network() + if net.get("edges"): + result["network"] = net + except Exception as exc: + logger.debug("Correlation network skipped: %s", exc) + + try: + dc = self.distance_correlation_matrix() + if not dc.empty: + result["distance_correlation"] = dc + except Exception as exc: + logger.debug("Distance correlation skipped: %s", exc) + + return result diff --git a/f2a/stats/advanced_dimreduction.py b/f2a/stats/advanced_dimreduction.py new file mode 100644 index 0000000..f4ccf5a --- /dev/null +++ b/f2a/stats/advanced_dimreduction.py @@ -0,0 +1,337 @@ +"""Advanced dimensionality reduction module. + +Provides t-SNE, UMAP (optional), and Factor Analysis for +non-linear dimensionality reduction and latent factor discovery. + +References: + - van der Maaten & Hinton (2008) — t-SNE + - McInnes et al. (2018) — UMAP + - Spearman (1904) — Factor Analysis +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class AdvancedDimReductionStats: + """Advanced dimensionality reduction analysis. + + Args: + df: Target DataFrame. + schema: Data schema. + tsne_perplexity: t-SNE perplexity parameter. + max_sample: Max rows to sample. 
+ """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + tsne_perplexity: float = 30.0, + max_sample: int = 5000, + ) -> None: + self._df = df + self._schema = schema + self._tsne_perplexity = tsne_perplexity + self._max_sample = max_sample + + def _prepare_data(self) -> tuple[np.ndarray, pd.DataFrame, list[str]] | None: + """Scale and sample numeric data.""" + cols = self._schema.numeric_columns + if len(cols) < 3: + return None + + try: + from sklearn.preprocessing import StandardScaler + except ImportError: + return None + + df_clean = self._df[cols].dropna() + if len(df_clean) < 20: + return None + + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + scaler = StandardScaler() + X = scaler.fit_transform(df_clean) + return X, df_clean, cols + + # ── t-SNE ───────────────────────────────────────────── + + def tsne_2d(self) -> dict[str, Any]: + """Compute t-SNE 2D embedding. + + t-SNE (t-distributed Stochastic Neighbor Embedding) is excellent + for visualizing high-dimensional data in 2D. + + Returns: + Dictionary with embedding coordinates, parameters. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, df_clean, cols = prepared + + try: + from sklearn.manifold import TSNE + except ImportError: + return {} + + perplexity = min(self._tsne_perplexity, max(5, len(X) / 4)) + + try: + tsne = TSNE( + n_components=2, + perplexity=perplexity, + random_state=42, + max_iter=1000, + learning_rate="auto", + init="pca", + ) + embedding = tsne.fit_transform(X) + + return { + "method": "t-SNE", + "embedding": pd.DataFrame( + embedding, + columns=["tsne_1", "tsne_2"], + ), + "perplexity": perplexity, + "kl_divergence": round(float(tsne.kl_divergence_), 4), + "n_samples": len(X), + "n_features": X.shape[1], + } + except Exception as exc: + logger.debug("t-SNE failed: %s", exc) + return {} + + # ── UMAP ────────────────────────────────────────────── + + def umap_2d(self) -> dict[str, Any]: + """Compute UMAP 2D embedding (if umap-learn is installed). + + UMAP (Uniform Manifold Approximation and Projection) preserves + both local and global structure better than t-SNE. + + Returns: + Dictionary with embedding coordinates, parameters. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, df_clean, cols = prepared + + try: + from umap import UMAP + except ImportError: + logger.info("umap-learn not installed; UMAP analysis skipped.") + return {} + + try: + n_neighbors = min(15, max(2, len(X) // 10)) + reducer = UMAP( + n_components=2, + n_neighbors=n_neighbors, + min_dist=0.1, + random_state=42, + ) + embedding = reducer.fit_transform(X) + + return { + "method": "UMAP", + "embedding": pd.DataFrame( + embedding, + columns=["umap_1", "umap_2"], + ), + "n_neighbors": n_neighbors, + "min_dist": 0.1, + "n_samples": len(X), + "n_features": X.shape[1], + } + except Exception as exc: + logger.debug("UMAP failed: %s", exc) + return {} + + # ── Factor Analysis ─────────────────────────────────── + + def factor_analysis(self, n_factors: int | None = None) -> dict[str, Any]: + """Perform Factor Analysis to discover latent factors. + + Factor Analysis models observed variables as linear combinations + of unobserved latent factors plus error terms. + + Args: + n_factors: Number of factors. Auto-detected if None. + + Returns: + Dictionary with loadings, variance explained, factor scores. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, df_clean, cols = prepared + + try: + from sklearn.decomposition import FactorAnalysis + except ImportError: + return {} + + # Auto-detect n_factors using eigenvalue > 1 rule (Kaiser criterion) + if n_factors is None: + cov = np.cov(X.T) + eigenvalues = np.linalg.eigvalsh(cov)[::-1] + n_factors = max(1, int((eigenvalues > 1).sum())) + n_factors = min(n_factors, len(cols) - 1, 10) + + if n_factors < 1: + return {} + + try: + fa = FactorAnalysis(n_components=n_factors, random_state=42) + fa.fit(X) + + loadings = pd.DataFrame( + fa.components_.T, + index=cols, + columns=[f"factor_{i + 1}" for i in range(n_factors)], + ).round(4) + + # Variance explained by each factor (approximate) + factor_var = np.sum(fa.components_ ** 2, axis=1) + total_var = np.sum(np.var(X, axis=0)) + var_explained = factor_var / total_var + + noise_variance = pd.DataFrame({ + "column": cols, + "noise_variance": np.round(fa.noise_variance_, 4), + "communality": np.round( + 1 - fa.noise_variance_ / np.maximum(np.var(X, axis=0), 1e-15), 4 + ), + }).set_index("column") + + return { + "method": "Factor Analysis", + "n_factors": n_factors, + "loadings": loadings, + "variance_explained": [round(float(v), 4) for v in var_explained], + "total_variance_explained": round(float(var_explained.sum()), 4), + "noise_variance": noise_variance, + "n_samples": len(X), + "n_features": len(cols), + } + except Exception as exc: + logger.debug("Factor Analysis failed: %s", exc) + return {} + + # ── Feature contributions ───────────────────────────── + + def feature_contribution(self) -> pd.DataFrame: + """Analyze feature contributions across dimensionality reduction. + + Computes how much each feature contributes to the variance + captured by PCA components. + + Returns: + DataFrame with feature importance across components. 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 3: + return pd.DataFrame() + + try: + from sklearn.decomposition import PCA + from sklearn.preprocessing import StandardScaler + except ImportError: + return pd.DataFrame() + + df_clean = self._df[cols].dropna() + if len(df_clean) < len(cols) + 1: + return pd.DataFrame() + + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + scaler = StandardScaler() + X = scaler.fit_transform(df_clean) + + n_comp = min(5, len(cols), len(df_clean) - 1) + pca = PCA(n_components=n_comp) + pca.fit(X) + + # Weighted contribution: |loading| * variance_explained + contributions = np.zeros(len(cols)) + for i in range(n_comp): + contributions += np.abs(pca.components_[i]) * pca.explained_variance_ratio_[i] + + result = pd.DataFrame({ + "column": cols, + "contribution_score": np.round(contributions, 4), + "rank": np.argsort(-contributions) + 1, + }).sort_values("contribution_score", ascending=False).set_index("column") + + return result + + # ── Summary ─────────────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return combined dimensionality reduction results.""" + result: dict[str, Any] = {} + + try: + tsne = self.tsne_2d() + if tsne: + # Store summary without large embedding + result["tsne"] = { + k: v for k, v in tsne.items() if k != "embedding" + } + if "embedding" in tsne: + result["tsne_embedding"] = tsne["embedding"] + except Exception as exc: + logger.debug("t-SNE skipped: %s", exc) + + try: + umap_res = self.umap_2d() + if umap_res: + result["umap"] = { + k: v for k, v in umap_res.items() if k != "embedding" + } + if "embedding" in umap_res: + result["umap_embedding"] = umap_res["embedding"] + except Exception as exc: + logger.debug("UMAP skipped: %s", exc) + + try: + fa = self.factor_analysis() + if fa: + result["factor_analysis"] = { + k: v for k, v in fa.items() + if k not in ("loadings", "noise_variance") + } + if "loadings" in fa: + 
result["factor_loadings"] = fa["loadings"] + if "noise_variance" in fa: + result["factor_noise"] = fa["noise_variance"] + except Exception as exc: + logger.debug("Factor Analysis skipped: %s", exc) + + try: + fc = self.feature_contribution() + if not fc.empty: + result["feature_contribution"] = fc + except Exception as exc: + logger.debug("Feature contribution skipped: %s", exc) + + return result diff --git a/f2a/stats/advanced_distribution.py b/f2a/stats/advanced_distribution.py new file mode 100644 index 0000000..fb989e2 --- /dev/null +++ b/f2a/stats/advanced_distribution.py @@ -0,0 +1,348 @@ +"""Advanced distribution analysis module. + +Provides best-fit distribution testing, power-transform recommendation, +Jarque-Bera normality test, ECDF computation, and KDE bandwidth analysis. + +References: + - Box & Cox (1964) — power transform + - Jarque & Bera (1987) — normality test + - Silverman (1986) — KDE bandwidth selection +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd +from scipy import stats as sp_stats + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + +# Candidate distributions for best-fit analysis +_CANDIDATE_DISTRIBUTIONS = [ + ("norm", sp_stats.norm), + ("lognorm", sp_stats.lognorm), + ("expon", sp_stats.expon), + ("gamma", sp_stats.gamma), + ("beta", sp_stats.beta), + ("weibull_min", sp_stats.weibull_min), + ("uniform", sp_stats.uniform), +] + + +class AdvancedDistributionStats: + """Advanced distribution analysis for numeric columns. + + Args: + df: Target DataFrame. + schema: Data schema. + n_fits: Number of candidate distributions to fit. + max_sample: Max rows to sample for expensive operations. 
+ """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + n_fits: int = 7, + max_sample: int = 5000, + ) -> None: + self._df = df + self._schema = schema + self._n_fits = min(n_fits, len(_CANDIDATE_DISTRIBUTIONS)) + self._max_sample = max_sample + + # ── Best-fit distribution ───────────────────────────── + + def best_fit(self) -> pd.DataFrame: + """Fit candidate distributions and rank by AIC/BIC. + + For each numeric column, fits up to ``n_fits`` scipy distributions, + computes AIC and BIC, and returns the best match. + + Returns: + DataFrame with columns: column, best_dist, aic, bic, ks_stat, ks_p, + params (per-column best). + """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + candidates = _CANDIDATE_DISTRIBUTIONS[: self._n_fits] + + for col in cols: + series = self._df[col].dropna() + if len(series) < 20: + continue + + sample = ( + series.sample(self._max_sample, random_state=42) + if len(series) > self._max_sample + else series + ) + data = sample.values + + best: dict[str, Any] | None = None + + for name, dist in candidates: + try: + params = dist.fit(data) + # Log-likelihood + ll = np.sum(dist.logpdf(data, *params)) + if not np.isfinite(ll): + continue + k = len(params) + n = len(data) + aic = 2 * k - 2 * ll + bic = k * np.log(n) - 2 * ll + + ks_stat, ks_p = sp_stats.kstest(data, name, args=params) + + entry = { + "dist_name": name, + "aic": float(aic), + "bic": float(bic), + "ks_stat": float(ks_stat), + "ks_p": float(ks_p), + "params": params, + } + if best is None or aic < best["aic"]: + best = entry + except Exception: + continue + + if best is not None: + rows.append({ + "column": col, + "best_distribution": best["dist_name"], + "aic": round(best["aic"], 2), + "bic": round(best["bic"], 2), + "ks_statistic": round(best["ks_stat"], 4), + "ks_p_value": round(best["ks_p"], 6), + "fit_quality": ( + "good" if best["ks_p"] > 0.05 + else "moderate" if best["ks_p"] > 0.01 + else "poor" + 
), + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Jarque-Bera normality test ──────────────────────── + + def jarque_bera(self) -> pd.DataFrame: + """Perform Jarque-Bera test for normality on each numeric column. + + The JB test jointly tests whether skewness and kurtosis + match a normal distribution. H0: data is normally distributed. + + Returns: + DataFrame with jb_stat, p_value, is_normal columns. + """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + if len(series) < 8: + continue + try: + jb_stat, p_val = sp_stats.jarque_bera(series) + rows.append({ + "column": col, + "jb_statistic": round(float(jb_stat), 4), + "p_value": round(float(p_val), 6), + "is_normal_0.05": float(p_val) > 0.05, + "skewness": round(float(series.skew()), 4), + "kurtosis": round(float(series.kurtosis()), 4), + }) + except Exception: + continue + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Power transform recommendation ──────────────────── + + def power_transform_recommendation(self) -> pd.DataFrame: + """Recommend power transformations (Box-Cox / Yeo-Johnson). + + Box-Cox requires strictly positive data; Yeo-Johnson works for any data. + Reports the optimal lambda and post-transform skewness. + + Returns: + DataFrame with method, lambda, original_skew, transformed_skew. 
+ """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + if len(series) < 10: + continue + + original_skew = float(series.skew()) + data = series.values + + # Try Box-Cox (positive data only) + bc_lambda = None + bc_skew = None + if (data > 0).all(): + try: + transformed, lmbda = sp_stats.boxcox(data) + bc_lambda = round(float(lmbda), 4) + bc_skew = round(float(pd.Series(transformed).skew()), 4) + except Exception: + pass + + # Yeo-Johnson (any data) + yj_lambda = None + yj_skew = None + try: + transformed, lmbda = sp_stats.yeojohnson(data) + yj_lambda = round(float(lmbda), 4) + yj_skew = round(float(pd.Series(transformed).skew()), 4) + except Exception: + pass + + # Recommendation + if bc_skew is not None and abs(bc_skew) < (abs(yj_skew) if yj_skew is not None else float("inf")): + recommended = "box-cox" + rec_lambda = bc_lambda + rec_skew = bc_skew + elif yj_skew is not None: + recommended = "yeo-johnson" + rec_lambda = yj_lambda + rec_skew = yj_skew + else: + recommended = "none" + rec_lambda = None + rec_skew = None + + needs_transform = abs(original_skew) > 0.5 + + rows.append({ + "column": col, + "original_skewness": round(original_skew, 4), + "recommended_method": recommended, + "optimal_lambda": rec_lambda, + "transformed_skewness": rec_skew, + "needs_transform": needs_transform, + "improvement": ( + round(abs(original_skew) - abs(rec_skew), 4) + if rec_skew is not None + else None + ), + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── ECDF data ───────────────────────────────────────── + + def ecdf(self) -> dict[str, pd.DataFrame]: + """Compute Empirical Cumulative Distribution Function for each column. + + Returns: + Dictionary mapping column name to DataFrame with x, ecdf columns. 
+ """ + cols = self._schema.numeric_columns + result: dict[str, pd.DataFrame] = {} + for col in cols: + series = self._df[col].dropna().sort_values() + if len(series) < 2: + continue + n = len(series) + # Subsample for very large data + if n > self._max_sample: + indices = np.linspace(0, n - 1, self._max_sample, dtype=int) + series = series.iloc[indices] + n = len(series) + result[col] = pd.DataFrame({ + "x": series.values, + "ecdf": np.arange(1, n + 1) / n, + }) + return result + + # ── KDE bandwidth analysis ──────────────────────────── + + def kde_analysis(self) -> pd.DataFrame: + """Compute optimal KDE bandwidth using Silverman's rule of thumb. + + Returns: + DataFrame with column, silverman_bw, scotts_bw, n. + """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + n = len(series) + if n < 5: + continue + + std = float(series.std()) + iqr = float(series.quantile(0.75) - series.quantile(0.25)) + + # Silverman's rule: h = 0.9 * min(std, IQR/1.34) * n^(-1/5) + spread = min(std, iqr / 1.34) if iqr > 0 else std + silverman = 0.9 * spread * (n ** (-0.2)) if spread > 0 else None + + # Scott's rule: h = 3.49 * std * n^(-1/3) + scotts = 3.49 * std * (n ** (-1 / 3)) if std > 0 else None + + rows.append({ + "column": col, + "n": n, + "std": round(std, 4), + "iqr": round(iqr, 4), + "silverman_bandwidth": round(silverman, 4) if silverman else None, + "scotts_bandwidth": round(scotts, 4) if scotts else None, + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Combined summary ────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return a combined advanced distribution analysis summary.""" + result: dict[str, Any] = {} + + try: + bf = self.best_fit() + if not bf.empty: + result["best_fit"] = bf + except Exception as exc: + logger.debug("Best-fit analysis skipped: %s", exc) + + try: + jb = 
self.jarque_bera() + if not jb.empty: + result["jarque_bera"] = jb + except Exception as exc: + logger.debug("Jarque-Bera test skipped: %s", exc) + + try: + pt = self.power_transform_recommendation() + if not pt.empty: + result["power_transform"] = pt + except Exception as exc: + logger.debug("Power transform analysis skipped: %s", exc) + + try: + kde = self.kde_analysis() + if not kde.empty: + result["kde_bandwidth"] = kde + except Exception as exc: + logger.debug("KDE analysis skipped: %s", exc) + + return result diff --git a/f2a/stats/categorical.py b/f2a/stats/categorical.py new file mode 100644 index 0000000..014c86d --- /dev/null +++ b/f2a/stats/categorical.py @@ -0,0 +1,146 @@ +"""Categorical data analysis module. + +Computes entropy, chi-square independence tests, and frequency analytics +for categorical columns. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +from scipy.stats import chi2_contingency + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class CategoricalStats: + """Analyse categorical columns in depth. + + Args: + df: Target DataFrame. + schema: Data schema. + """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + # ── Frequency ───────────────────────────────────────── + + def frequency_table(self, column: str, top_n: int = 20) -> pd.DataFrame: + """Return a frequency table for a single column. + + Args: + column: Column name. + top_n: Max categories to show. 
+ """ + series = self._df[column] + vc = series.value_counts() + total = int(series.count()) + + df = pd.DataFrame({ + "value": vc.index[:top_n], + "count": vc.values[:top_n], + "percentage": (vc.values[:top_n] / total * 100).round(2) if total > 0 else 0, + }) + if len(vc) > top_n: + other_count = int(vc.values[top_n:].sum()) + other_row = pd.DataFrame([{ + "value": f"(other {len(vc) - top_n} categories)", + "count": other_count, + "percentage": round(other_count / total * 100, 2) if total > 0 else 0, + }]) + df = pd.concat([df, other_row], ignore_index=True) + return df + + # ── Entropy ─────────────────────────────────────────── + + def entropy_summary(self) -> pd.DataFrame: + """Compute Shannon entropy for each categorical column.""" + cols = self._schema.categorical_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + vc = self._df[col].value_counts(normalize=True) + entropy = float(-np.sum(vc * np.log2(vc + 1e-15))) + max_entropy = float(np.log2(len(vc))) if len(vc) > 1 else 0.0 + rows.append({ + "column": col, + "unique_values": int(self._df[col].nunique()), + "entropy": round(entropy, 4), + "max_entropy": round(max_entropy, 4), + "normalized_entropy": round(entropy / max_entropy, 4) if max_entropy > 0 else 0.0, + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Chi-square independence ─────────────────────────── + + def chi_square_matrix(self) -> pd.DataFrame: + """Chi-square independence test p-values between categorical pairs. + + Returns: + Square DataFrame of p-values. A low p-value (<0.05) signals + a statistically significant association between two columns. 
+ """ + cols = self._schema.categorical_columns + if len(cols) < 2: + return pd.DataFrame() + + # Limit to prevent combinatorial explosion + cols = cols[:15] + n = len(cols) + matrix = pd.DataFrame(np.ones((n, n)), index=cols, columns=cols) + + for i in range(n): + for j in range(i + 1, n): + try: + ct = pd.crosstab(self._df[cols[i]], self._df[cols[j]]) + if ct.size > 0 and ct.sum().sum() > 0: + _, p, _, _ = chi2_contingency(ct) + matrix.iloc[i, j] = round(p, 6) + matrix.iloc[j, i] = round(p, 6) + except Exception: + matrix.iloc[i, j] = np.nan + matrix.iloc[j, i] = np.nan + + return matrix + + # ── Combined summary ────────────────────────────────── + + def summary(self) -> pd.DataFrame: + """Return a combined categorical analysis summary table.""" + cols = self._schema.categorical_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col] + vc = series.value_counts() + top = vc.index[0] if len(vc) > 0 else None + top_freq = int(vc.iloc[0]) if len(vc) > 0 else 0 + count = int(series.count()) + + # Entropy + vc_norm = series.value_counts(normalize=True) + entropy = float(-np.sum(vc_norm * np.log2(vc_norm + 1e-15))) + max_entropy = float(np.log2(len(vc_norm))) if len(vc_norm) > 1 else 0.0 + + rows.append({ + "column": col, + "count": count, + "unique": int(series.nunique()), + "top_value": str(top)[:50] if top is not None else None, + "top_frequency": top_freq, + "top_%": round(top_freq / count * 100, 2) if count > 0 else 0.0, + "entropy": round(entropy, 4), + "norm_entropy": round(entropy / max_entropy, 4) if max_entropy > 0 else 0.0, + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() diff --git a/f2a/stats/clustering.py b/f2a/stats/clustering.py new file mode 100644 index 0000000..d97bf64 --- /dev/null +++ b/f2a/stats/clustering.py @@ -0,0 +1,354 @@ +"""Clustering analysis module. 
+ +Provides K-Means (with elbow + silhouette), DBSCAN (auto-eps), +hierarchical clustering, and cluster profiling. + +References: + - MacQueen (1967) — K-Means + - Ester et al. (1996) — DBSCAN + - Rousseeuw (1987) — silhouette score +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class ClusteringStats: + """Clustering analysis for numeric columns. + + Args: + df: Target DataFrame. + schema: Data schema. + max_k: Maximum k for K-Means elbow search. + max_sample: Max rows to sample. + """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + max_k: int = 10, + max_sample: int = 5000, + ) -> None: + self._df = df + self._schema = schema + self._max_k = max_k + self._max_sample = max_sample + + def _prepare_data(self) -> tuple[np.ndarray, list[str]] | None: + """Scale and sample numeric data for clustering.""" + cols = self._schema.numeric_columns + if len(cols) < 2: + return None + + try: + from sklearn.preprocessing import StandardScaler + except ImportError: + logger.info("scikit-learn not available for clustering.") + return None + + df_clean = self._df[cols].dropna() + if len(df_clean) < 10: + return None + + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + scaler = StandardScaler() + X = scaler.fit_transform(df_clean) + return X, cols + + # ── K-Means with elbow & silhouette ─────────────────── + + def kmeans_analysis(self) -> dict[str, Any]: + """Perform K-Means clustering with elbow and silhouette analysis. 
+ + Returns: + Dictionary with: + - ``elbow_data``: DataFrame (k, inertia, silhouette) + - ``optimal_k``: best k by silhouette score + - ``labels``: cluster labels for optimal k + - ``cluster_sizes``: dict of cluster → count + """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, cols = prepared + + try: + from sklearn.cluster import KMeans + from sklearn.metrics import silhouette_score + except ImportError: + return {} + + max_k = min(self._max_k, len(X) - 1) + if max_k < 2: + return {} + + rows: list[dict] = [] + best_score = -1.0 + best_k = 2 + best_labels: np.ndarray | None = None + + for k in range(2, max_k + 1): + try: + km = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300) + labels = km.fit_predict(X) + inertia = float(km.inertia_) + sil = float(silhouette_score(X, labels)) + + rows.append({ + "k": k, + "inertia": round(inertia, 2), + "silhouette_score": round(sil, 4), + }) + + if sil > best_score: + best_score = sil + best_k = k + best_labels = labels + except Exception: + continue + + if not rows: + return {} + + elbow_df = pd.DataFrame(rows).set_index("k") + + # Cluster sizes + sizes: dict[str, int] = {} + if best_labels is not None: + unique, counts = np.unique(best_labels, return_counts=True) + sizes = {f"cluster_{int(u)}": int(c) for u, c in zip(unique, counts)} + + return { + "elbow_data": elbow_df, + "optimal_k": best_k, + "best_silhouette": round(best_score, 4), + "cluster_sizes": sizes, + "n_samples": len(X), + } + + # ── DBSCAN with auto-eps ────────────────────────────── + + def dbscan_analysis(self) -> dict[str, Any]: + """Perform DBSCAN clustering with automated eps selection. + + Uses the k-distance graph method to estimate eps. + + Returns: + Dictionary with labels, n_clusters, n_noise, cluster_sizes. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, cols = prepared + + try: + from sklearn.cluster import DBSCAN + from sklearn.neighbors import NearestNeighbors + except ImportError: + return {} + + # Auto-eps via k-distance graph (k = min_samples) + min_samples = max(2, min(5, len(X) // 20)) + try: + nn = NearestNeighbors(n_neighbors=min_samples) + nn.fit(X) + distances, _ = nn.kneighbors(X) + k_distances = np.sort(distances[:, -1]) + + # Estimate eps at the "elbow" using maximum curvature + n = len(k_distances) + if n < 10: + return {} + + # Simple elbow: point of maximum second derivative + second_deriv = np.diff(k_distances, n=2) + elbow_idx = int(np.argmax(second_deriv)) + 1 + eps = float(k_distances[elbow_idx]) + eps = max(eps, 0.1) # minimum eps + + db = DBSCAN(eps=eps, min_samples=min_samples) + labels = db.fit_predict(X) + + n_clusters = len(set(labels) - {-1}) + n_noise = int((labels == -1).sum()) + + sizes: dict[str, int] = {} + unique, counts = np.unique(labels, return_counts=True) + for u, c in zip(unique, counts): + lbl = "noise" if u == -1 else f"cluster_{int(u)}" + sizes[lbl] = int(c) + + return { + "eps": round(eps, 4), + "min_samples": min_samples, + "n_clusters": n_clusters, + "n_noise": n_noise, + "noise_ratio": round(n_noise / len(X), 4), + "cluster_sizes": sizes, + "n_samples": len(X), + } + except Exception as exc: + logger.debug("DBSCAN failed: %s", exc) + return {} + + # ── Hierarchical clustering ─────────────────────────── + + def hierarchical_analysis(self) -> dict[str, Any]: + """Perform hierarchical (agglomerative) clustering. + + Returns: + Dictionary with n_clusters (auto), labels, linkage method, + and dendrogram data. 
+ """ + prepared = self._prepare_data() + if prepared is None: + return {} + + X, cols = prepared + + try: + from sklearn.cluster import AgglomerativeClustering + from sklearn.metrics import silhouette_score + from scipy.cluster.hierarchy import linkage + except ImportError: + return {} + + # Try different n_clusters, pick best silhouette + best_k = 2 + best_score = -1.0 + + max_k = min(self._max_k, len(X) - 1) + for k in range(2, max_k + 1): + try: + agg = AgglomerativeClustering(n_clusters=k, linkage="ward") + labels = agg.fit_predict(X) + score = float(silhouette_score(X, labels)) + if score > best_score: + best_score = score + best_k = k + except Exception: + continue + + # Final fit with best k + try: + agg = AgglomerativeClustering(n_clusters=best_k, linkage="ward") + labels = agg.fit_predict(X) + + sizes: dict[str, int] = {} + unique, counts = np.unique(labels, return_counts=True) + for u, c in zip(unique, counts): + sizes[f"cluster_{int(u)}"] = int(c) + + # Linkage matrix for dendrogram + Z = linkage(X[:min(500, len(X))], method="ward") + + return { + "optimal_k": best_k, + "silhouette_score": round(best_score, 4), + "linkage_method": "ward", + "cluster_sizes": sizes, + "linkage_matrix": Z, + "n_samples": len(X), + } + except Exception as exc: + logger.debug("Hierarchical clustering failed: %s", exc) + return {} + + # ── Cluster profiling ───────────────────────────────── + + def cluster_profiles(self, kmeans_result: dict[str, Any] | None = None) -> pd.DataFrame: + """Profile clusters by computing per-cluster mean of each feature. + + Uses the optimal K-Means clustering result. + + Args: + kmeans_result: Pre-computed K-Means result (avoids re-running). + + Returns: + DataFrame with cluster labels as index, feature means as columns. 
+ """ + if kmeans_result is None: + kmeans_result = self.kmeans_analysis() + if not kmeans_result: + return pd.DataFrame() + + prepared = self._prepare_data() + if prepared is None: + return pd.DataFrame() + + X, cols = prepared + optimal_k = kmeans_result["optimal_k"] + + try: + from sklearn.cluster import KMeans + except ImportError: + return pd.DataFrame() + + try: + km = KMeans(n_clusters=optimal_k, random_state=42, n_init=10) + labels = km.fit_predict(X) + + # Build profiles using original (unscaled) data + df_clean = self._df[cols].dropna() + if len(df_clean) > self._max_sample: + df_clean = df_clean.sample(self._max_sample, random_state=42) + + df_clean = df_clean.copy() + df_clean["cluster"] = labels[: len(df_clean)] + + profiles = df_clean.groupby("cluster").mean().round(4) + profiles.index = [f"cluster_{i}" for i in profiles.index] + + return profiles + except Exception: + return pd.DataFrame() + + # ── Summary ─────────────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return combined clustering analysis results.""" + result: dict[str, Any] = {} + + try: + km = self.kmeans_analysis() + if km: + result["kmeans"] = km + except Exception as exc: + logger.debug("K-Means analysis skipped: %s", exc) + + try: + db = self.dbscan_analysis() + if db: + result["dbscan"] = db + except Exception as exc: + logger.debug("DBSCAN analysis skipped: %s", exc) + + try: + hc = self.hierarchical_analysis() + if hc: + result["hierarchical"] = hc + except Exception as exc: + logger.debug("Hierarchical analysis skipped: %s", exc) + + try: + cp = self.cluster_profiles(kmeans_result=km if "kmeans" in result else None) + if not cp.empty: + result["profiles"] = cp + except Exception as exc: + logger.debug("Cluster profiling skipped: %s", exc) + + return result diff --git a/f2a/stats/column_role.py b/f2a/stats/column_role.py new file mode 100644 index 0000000..f48aaf9 --- /dev/null +++ b/f2a/stats/column_role.py @@ -0,0 +1,283 @@ +"""Column role 
classification — auto-detect the semantic role of each column. + +Infers whether a column acts as an ID, timestamp, numeric feature, categorical +feature, ordinal feature, binary variable, text field, constant, or potential +target variable. Each assignment comes with a confidence score and evidence +so downstream consumers (ML readiness, insight engine) can make informed +decisions. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + +# ===================================================================== +# Data classes +# ===================================================================== + +@dataclass +class ColumnRole: + """Role assignment for a single column.""" + + column: str + primary_role: str # id | timestamp | numeric_feature | categorical_feature | ordinal_feature | binary | text | constant | target_candidate + confidence: float # 0-1 + secondary_role: str | None = None + properties: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "column": self.column, + "primary_role": self.primary_role, + "confidence": self.confidence, + "secondary_role": self.secondary_role, + "properties": self.properties, + } + + +# Regex patterns for name-based heuristics +_ID_PATTERNS = re.compile( + r"(^id$|_id$|^pk$|^key$|^index$|^uid$|^uuid$|^guid$|^row_?num|^seq)", + re.IGNORECASE, +) +_TIME_PATTERNS = re.compile( + r"(date|time|_at$|_ts$|timestamp|created|updated|modified|year|month|day)", + re.IGNORECASE, +) +_TARGET_PATTERNS = re.compile( + r"(^target$|^label$|^y$|^class$|^outcome$|^response$|^result$|^is_|^has_)", + re.IGNORECASE, +) +_ORDINAL_PATTERNS = re.compile( + r"(level|grade|rating|rank|score|priority|stage|phase|tier|degree)", + re.IGNORECASE, +) + + +class ColumnRoleClassifier: + 
"""Automatically assign a semantic role to every column in the dataset. + + Parameters + ---------- + df : pd.DataFrame + The analysis DataFrame. + schema : DataSchema + Column type metadata. + """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + def classify(self) -> list[ColumnRole]: + """Return a role assignment for each column.""" + roles: list[ColumnRole] = [] + for col_info in self._schema.columns: + role = self._classify_single(col_info) + roles.append(role) + return roles + + def summary(self) -> pd.DataFrame: + """Summary table: column × role × confidence.""" + roles = self.classify() + rows = [r.to_dict() for r in roles] + df = pd.DataFrame(rows) + if not df.empty: + df = df.set_index("column") + return df + + # ------------------------------------------------------------------ + + def _classify_single(self, col_info: Any) -> ColumnRole: + col_name = col_info.name + dtype = str(col_info.dtype) + inferred = col_info.inferred_type # "numeric", "categorical", "text", "datetime", "boolean" + n_unique = col_info.n_unique + n_missing = col_info.n_missing + n_total = self._schema.n_rows + + unique_ratio = n_unique / max(n_total, 1) + + # 1. Constant + if n_unique <= 1: + return ColumnRole( + column=col_name, + primary_role="constant", + confidence=1.0, + properties={"n_unique": n_unique}, + ) + + # 2. Binary + if n_unique == 2: + conf = 0.9 + secondary = None + if _TARGET_PATTERNS.search(col_name): + secondary = "target_candidate" + conf = 0.85 + return ColumnRole( + column=col_name, + primary_role="binary", + confidence=conf, + secondary_role=secondary, + properties={"n_unique": 2, "values": self._top_values(col_name, 2)}, + ) + + # 3. 
Datetime / timestamp + if inferred == "datetime": + return ColumnRole( + column=col_name, + primary_role="timestamp", + confidence=0.95, + properties={"dtype": dtype}, + ) + if _TIME_PATTERNS.search(col_name) and inferred == "numeric": + # Possibly an epoch timestamp + series = pd.to_numeric(self._df[col_name], errors="coerce").dropna() + if not series.empty and self._is_monotonic(series): + return ColumnRole( + column=col_name, + primary_role="timestamp", + confidence=0.70, + properties={"dtype": dtype, "hint": "monotonic numeric with time-like name"}, + ) + + # 4. ID-like + if self._is_id_like(col_name, unique_ratio, n_unique, inferred): + conf = 0.6 + if _ID_PATTERNS.search(col_name): + conf = 0.9 + elif unique_ratio > 0.99: + conf = 0.85 + return ColumnRole( + column=col_name, + primary_role="id", + confidence=conf, + properties={"unique_ratio": round(unique_ratio, 4)}, + ) + + # 5. Text + if inferred == "text": + return ColumnRole( + column=col_name, + primary_role="text", + confidence=0.9, + properties={"avg_length": self._avg_str_length(col_name)}, + ) + + # 6. Ordinal feature + if self._is_ordinal(col_name, inferred, n_unique, n_total): + conf = 0.7 + if _ORDINAL_PATTERNS.search(col_name): + conf = 0.85 + return ColumnRole( + column=col_name, + primary_role="ordinal_feature", + confidence=conf, + properties={"n_unique": n_unique}, + ) + + # 7. Target candidate (categorical with specific naming) + if _TARGET_PATTERNS.search(col_name) and n_unique <= 20: + return ColumnRole( + column=col_name, + primary_role="target_candidate", + confidence=0.7, + properties={"n_unique": n_unique, "inferred_type": inferred}, + ) + + # 8. Categorical feature + if inferred == "categorical" or inferred == "boolean": + return ColumnRole( + column=col_name, + primary_role="categorical_feature", + confidence=0.85, + properties={"n_unique": n_unique, "unique_ratio": round(unique_ratio, 4)}, + ) + + # 9. 
Numeric feature (default for numeric) + if inferred == "numeric": + secondary = None + if _TARGET_PATTERNS.search(col_name): + secondary = "target_candidate" + return ColumnRole( + column=col_name, + primary_role="numeric_feature", + confidence=0.85, + secondary_role=secondary, + properties={"dtype": dtype}, + ) + + # Fallback + return ColumnRole( + column=col_name, + primary_role="numeric_feature" if inferred == "numeric" else "categorical_feature", + confidence=0.5, + properties={"inferred_type": inferred}, + ) + + # ------------------------------------------------------------------ + # Heuristics + # ------------------------------------------------------------------ + + def _is_id_like(self, col_name: str, unique_ratio: float, n_unique: int, inferred: str) -> bool: + if _ID_PATTERNS.search(col_name): + return unique_ratio > 0.8 + if unique_ratio > 0.95 and n_unique > 20: + if inferred in ("text", "categorical"): + return True + if inferred == "numeric": + series = pd.to_numeric(self._df[col_name], errors="coerce").dropna() + if not series.empty and self._is_monotonic(series): + return True + return False + + @staticmethod + def _is_monotonic(series: pd.Series) -> bool: + """Check if a numeric series is (roughly) monotonic.""" + if len(series) < 5: + return False + diffs = series.diff().dropna() + if diffs.empty: + return False + pos = (diffs >= 0).sum() + neg = (diffs <= 0).sum() + ratio = max(pos, neg) / len(diffs) + return ratio > 0.95 + + def _is_ordinal(self, col_name: str, inferred: str, n_unique: int, n_total: int) -> bool: + """Heuristic: integer column with small distinct count and ordinal-like name.""" + if n_unique > 20 or n_unique < 3: + return False + if _ORDINAL_PATTERNS.search(col_name): + return True + if inferred == "numeric" and col_name in self._df.columns: + series = pd.to_numeric(self._df[col_name], errors="coerce").dropna() + if not series.empty and (series == series.astype(int)).all(): + vals = sorted(series.unique()) + if len(vals) <= 15: 
+ # Check if values are roughly consecutive + span = vals[-1] - vals[0] + if span > 0 and len(vals) / (span + 1) > 0.5: + return True + return False + + def _top_values(self, col_name: str, n: int = 5) -> list: + if col_name not in self._df.columns: + return [] + return self._df[col_name].dropna().value_counts().head(n).index.tolist() + + def _avg_str_length(self, col_name: str) -> float: + if col_name not in self._df.columns: + return 0.0 + s = self._df[col_name].dropna().astype(str) + return round(float(s.str.len().mean()), 1) if not s.empty else 0.0 diff --git a/f2a/stats/correlation.py b/f2a/stats/correlation.py new file mode 100644 index 0000000..b66b379 --- /dev/null +++ b/f2a/stats/correlation.py @@ -0,0 +1,157 @@ +"""Correlation analysis module.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class CorrelationStats: + """Analyze correlations between columns. + + Args: + df: Target DataFrame to analyze. + schema: Data schema. 
+ """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + def pearson(self) -> pd.DataFrame: + """Return the Pearson correlation matrix.""" + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + return self._df[cols].corr(method="pearson") + + def spearman(self) -> pd.DataFrame: + """Return the Spearman rank correlation matrix.""" + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + return self._df[cols].corr(method="spearman") + + def kendall(self) -> pd.DataFrame: + """Return the Kendall tau correlation matrix.""" + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + # Kendall is expensive — limit columns + cols = cols[:15] + return self._df[cols].corr(method="kendall") + + def cramers_v_matrix(self) -> pd.DataFrame: + """Return the Cramer's V matrix for categorical columns.""" + cols = self._schema.categorical_columns + if len(cols) < 2: + return pd.DataFrame() + + cols = cols[:15] + n = len(cols) + matrix = pd.DataFrame(np.ones((n, n)), index=cols, columns=cols) + + for i in range(n): + for j in range(i + 1, n): + v = self._cramers_v(self._df[cols[i]], self._df[cols[j]]) + matrix.iloc[i, j] = v + matrix.iloc[j, i] = v + + return matrix + + def vif(self) -> pd.DataFrame: + """Compute Variance Inflation Factor for numeric columns. + + VIF > 5 suggests moderate multicollinearity; + VIF > 10 suggests severe multicollinearity. + + Uses the inverse-correlation-matrix diagonal method. 
+ """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + + df_clean = self._df[cols].dropna() + if len(df_clean) < len(cols) + 1: + return pd.DataFrame() + + corr = df_clean.corr() + try: + corr_inv = np.linalg.inv(corr.values) + vif_values = np.diag(corr_inv) + except np.linalg.LinAlgError: + logger.warning("Singular correlation matrix; VIF cannot be computed.") + return pd.DataFrame() + + rows: list[dict] = [] + for col, vif_val in zip(cols, vif_values): + severity = ( + "severe" if vif_val > 10 + else "moderate" if vif_val > 5 + else "low" + ) + rows.append({ + "column": col, + "VIF": round(float(vif_val), 2), + "multicollinearity": severity, + }) + + return ( + pd.DataFrame(rows) + .set_index("column") + .sort_values("VIF", ascending=False) + ) + + def high_correlations(self, threshold: float = 0.9) -> list[tuple[str, str, float]]: + """Return pairs with high correlation. + + Args: + threshold: Absolute correlation coefficient threshold. + + Returns: + List of ``(col_a, col_b, correlation)`` tuples. 
+ """ + corr = self.pearson() + if corr.empty: + return [] + + pairs: list[tuple[str, str, float]] = [] + cols = corr.columns + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + val = corr.iloc[i, j] + if abs(val) >= threshold: + pairs.append((cols[i], cols[j], round(float(val), 4))) + + if pairs: + logger.warning( + "Multicollinearity warning: %d column pairs have |r| >= %.2f.", + len(pairs), + threshold, + ) + + return pairs + + # ── Internal helpers ──────────────────────────────── + + @staticmethod + def _cramers_v(x: pd.Series, y: pd.Series) -> float: + """Compute Cramer's V between two categorical variables.""" + confusion = pd.crosstab(x, y) + n = confusion.sum().sum() + if n == 0: + return 0.0 + + from scipy.stats import chi2_contingency + + chi2, _, _, _ = chi2_contingency(confusion) + min_dim = min(confusion.shape) - 1 + if min_dim == 0: + return 0.0 + + return float(np.sqrt(chi2 / (n * min_dim))) diff --git a/f2a/stats/cross_analysis.py b/f2a/stats/cross_analysis.py new file mode 100644 index 0000000..95497ae --- /dev/null +++ b/f2a/stats/cross_analysis.py @@ -0,0 +1,497 @@ +"""Cross-dimensional analysis — discovers patterns across analysis boundaries. + +Instead of treating each analysis (correlation, cluster, outlier, missing, …) +in isolation, this module crosses two or more dimensions to reveal composite +patterns that single-axis analyses miss: + +* **Outlier × Cluster**: Are anomalies concentrated in specific clusters? +* **Missing × Correlation**: Is missingness systematic (MAR) or random (MCAR)? +* **Distribution × Outlier**: Which outlier method is appropriate given tail shape? +* **Cluster × Correlation (Simpson's Paradox)**: Does aggregation mask reversed relationships? +* **Feature Importance × Missing**: Are critical features losing information? +* **Dim-Reduction × Cluster × Anomaly**: Unified 2-D embedding overlay. 
+""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd +from scipy import stats as sp_stats + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class CrossAnalysis: + """Run all cross-dimensional analyses given pre-computed stats. + + Parameters + ---------- + df : pd.DataFrame + The (cleaned) analysis DataFrame. + schema : DataSchema + Inferred schema. + stats : StatsResult + Previously computed statistical results (basic + advanced). + max_cols : int + Maximum numeric columns to consider in expensive pairwise ops. + """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + stats: Any, + *, + max_cols: int = 20, + ) -> None: + self._df = df + self._schema = schema + self._stats = stats + self._max_cols = max_cols + + # ------------------------------------------------------------------ + # Public + # ------------------------------------------------------------------ + + def summary(self) -> dict[str, Any]: + """Execute all cross-analyses and return a combined dict.""" + result: dict[str, Any] = {} + + try: + r = self.outlier_by_cluster() + if r is not None: + result["outlier_by_cluster"] = r + except Exception as exc: + logger.debug("outlier_by_cluster failed: %s", exc) + + try: + r = self.missing_correlation() + if r is not None: + result["missing_correlation"] = r + except Exception as exc: + logger.debug("missing_correlation failed: %s", exc) + + try: + r = self.distribution_outlier_fitness() + if r is not None: + result["distribution_outlier_fitness"] = r + except Exception as exc: + logger.debug("distribution_outlier_fitness failed: %s", exc) + + try: + r = self.simpson_paradox() + if r is not None: + result["simpson_paradox"] = r + except Exception as exc: + logger.debug("simpson_paradox failed: %s", exc) + + try: + r = self.importance_vs_missing() + if r is not None: + result["importance_vs_missing"] = r + except 
Exception as exc: + logger.debug("importance_vs_missing failed: %s", exc) + + try: + r = self.unified_2d_embedding() + if r is not None: + result["unified_embedding"] = r + except Exception as exc: + logger.debug("unified_2d_embedding failed: %s", exc) + + return result + + # ------------------------------------------------------------------ + # X1. Outlier × Cluster + # ------------------------------------------------------------------ + + def outlier_by_cluster(self) -> dict[str, Any] | None: + """Per-cluster anomaly rates from consensus anomaly + K-Means labels.""" + adv = self._stats.advanced_stats + clustering = adv.get("clustering", {}) + anomaly_full = adv.get("advanced_anomaly_full", {}) + + km = clustering.get("kmeans") + iso = anomaly_full.get("isolation_forest") + if not km or not iso: + return None + + labels_cluster = km.get("labels") + labels_anomaly = iso.get("labels") + if labels_cluster is None or labels_anomaly is None: + return None + + # Align lengths (both should be n_samples after sampling) + n = min(len(labels_cluster), len(labels_anomaly)) + if n == 0: + return None + + c_labels = np.asarray(labels_cluster[:n]) + a_labels = np.asarray(labels_anomaly[:n]) + + anomaly_mask = a_labels == -1 + unique_clusters = np.unique(c_labels) + + rows = [] + for cl in unique_clusters: + cl_mask = c_labels == cl + cl_size = int(cl_mask.sum()) + cl_anomalies = int((cl_mask & anomaly_mask).sum()) + rows.append({ + "cluster": f"cluster_{cl}" if cl >= 0 else "noise", + "size": cl_size, + "anomaly_count": cl_anomalies, + "anomaly_rate": round(cl_anomalies / max(cl_size, 1), 4), + }) + + df_result = pd.DataFrame(rows) + + # Chi-square test for uniform anomaly distribution + expected_rate = anomaly_mask.sum() / max(n, 1) + chi2_p = None + if len(unique_clusters) >= 2 and anomaly_mask.sum() > 0: + observed = df_result["anomaly_count"].values + expected = df_result["size"].values * expected_rate + expected = np.where(expected < 1, 1, expected) + try: + chi2, p = 
sp_stats.chisquare(observed, f_exp=expected) + chi2_p = float(p) + except Exception: + pass + + return { + "table": df_result, + "overall_anomaly_rate": float(expected_rate), + "chi2_uniform_p": chi2_p, + "is_uniform": chi2_p is not None and chi2_p > 0.05, + } + + # ------------------------------------------------------------------ + # X2. Missing × Correlation (MAR detection) + # ------------------------------------------------------------------ + + def missing_correlation(self) -> dict[str, Any] | None: + """Correlate missing-indicators with numeric columns to diagnose MAR.""" + mi = self._stats.missing_info + if mi.empty or "missing_ratio" not in mi.columns: + return None + + # Columns with any missing + miss_cols = mi[mi["missing_ratio"] > 0].index.tolist() + if not miss_cols: + return None + + num_cols = self._schema.numeric_columns[:self._max_cols] + if not num_cols: + return None + + # Build indicator matrix + indicators = pd.DataFrame(index=self._df.index) + for col in miss_cols: + if col in self._df.columns: + indicators[f"{col}_missing"] = self._df[col].isna().astype(int) + + if indicators.empty: + return None + + # Correlate indicators with numeric columns + num_data = self._df[num_cols].apply(pd.to_numeric, errors="coerce") + + corr_matrix = pd.DataFrame( + np.nan, index=indicators.columns, columns=num_cols, + ) + mar_suspects: list[dict[str, Any]] = [] + + for ind_col in indicators.columns: + ind_series = indicators[ind_col] + if ind_series.sum() < 5 or ind_series.sum() == len(ind_series): + continue # too few or all missing + for num_col in num_cols: + valid = num_data[num_col].notna() & ind_series.notna() + if valid.sum() < 10: + continue + try: + r, p = sp_stats.pointbiserialr( + ind_series[valid].values, + num_data[num_col][valid].values, + ) + corr_matrix.loc[ind_col, num_col] = r + if abs(r) > 0.2 and p < 0.05: + mar_suspects.append({ + "missing_column": ind_col.replace("_missing", ""), + "correlated_with": num_col, + "correlation": 
round(float(r), 4), + "p_value": round(float(p), 6), + }) + except Exception: + continue + + # Diagnose MCAR vs MAR + diagnosis = "MCAR_likely" + if mar_suspects: + max_abs_r = max(abs(s["correlation"]) for s in mar_suspects) + if max_abs_r > 0.4: + diagnosis = "MAR_strong" + elif max_abs_r > 0.2: + diagnosis = "MAR_moderate" + + # Imputation strategy recommendation + strategies: dict[str, str] = {} + for col in miss_cols: + ratio = float(mi.loc[col, "missing_ratio"]) if col in mi.index else 0 + is_numeric = col in self._schema.numeric_columns + has_mar = any(s["missing_column"] == col for s in mar_suspects) + + if ratio > 0.5: + strategies[col] = "drop_column" + elif has_mar: + strategies[col] = "knn_or_mice" if is_numeric else "model_based" + elif is_numeric: + strategies[col] = "median" + else: + strategies[col] = "mode" + + return { + "indicator_correlation": corr_matrix.dropna(how="all", axis=0).dropna(how="all", axis=1), + "mar_suspects": pd.DataFrame(mar_suspects) if mar_suspects else pd.DataFrame(), + "diagnosis": diagnosis, + "imputation_strategy": strategies, + } + + # ------------------------------------------------------------------ + # X3. 
Distribution × Outlier Method Fitness + # ------------------------------------------------------------------ + + def distribution_outlier_fitness(self) -> pd.DataFrame | None: + """Recommend the best outlier detection method per column based on distribution shape.""" + dist = self._stats.distribution_info + summary = self._stats.summary + if dist.empty or summary.empty: + return None + + rows = [] + for col in dist.index: + if col not in summary.index: + continue + + skew = dist.loc[col].get("skewness", 0) or 0 + kurt = dist.loc[col].get("kurtosis", 0) or 0 + is_normal = dist.loc[col].get("is_normal_0.05", False) + + abs_skew = abs(skew) + reasons = [] + + if is_normal and abs_skew < 1 and abs(kurt) < 3: + method = "zscore" + reasons.append("approximately normal distribution") + elif abs_skew > 2 or kurt > 7: + method = "isolation_forest" + reasons.append("heavy-tailed or highly skewed distribution") + if abs_skew > 2: + reasons.append(f"skewness={skew:.2f}") + if kurt > 7: + reasons.append(f"kurtosis={kurt:.1f}") + elif abs_skew > 1 or kurt > 3: + method = "iqr" + reasons.append("moderately skewed/heavy-tailed") + else: + method = "iqr" + reasons.append("moderate distribution shape") + + rows.append({ + "column": col, + "skewness": round(float(skew), 3), + "kurtosis": round(float(kurt), 3), + "is_normal": bool(is_normal), + "recommended_method": method, + "reason": "; ".join(reasons), + }) + + if not rows: + return None + return pd.DataFrame(rows).set_index("column") + + # ------------------------------------------------------------------ + # X4. 
Cluster × Correlation — Simpson's Paradox Detection + # ------------------------------------------------------------------ + + def simpson_paradox(self) -> dict[str, Any] | None: + """Detect Simpson's paradox: overall correlation direction reverses within clusters.""" + adv = self._stats.advanced_stats + clustering = adv.get("clustering", {}) + km = clustering.get("kmeans") + if not km: + return None + + cluster_labels = km.get("labels") + if cluster_labels is None: + return None + + pearson = self._stats.correlation_matrix + if pearson.empty: + return None + + num_cols = [c for c in pearson.columns if c in self._df.columns][:self._max_cols] + if len(num_cols) < 2: + return None + + n = min(len(cluster_labels), len(self._df)) + labels = np.asarray(cluster_labels[:n]) + df_sub = self._df.iloc[:n] + + unique_clusters = np.unique(labels) + if len(unique_clusters) < 2: + return None + + paradoxes: list[dict[str, Any]] = [] + + for i, c1 in enumerate(num_cols): + for c2 in num_cols[i + 1:]: + overall_r = pearson.loc[c1, c2] if c1 in pearson.index and c2 in pearson.columns else 0 + if abs(overall_r) < 0.1: + continue # skip negligible correlations + + cluster_corrs = {} + n_reversed = 0 + for cl in unique_clusters: + mask = labels == cl + if mask.sum() < 10: + continue + try: + x = pd.to_numeric(df_sub.loc[mask.nonzero()[0], c1], errors="coerce").dropna() + y = pd.to_numeric(df_sub.loc[mask.nonzero()[0], c2], errors="coerce").dropna() + common_idx = x.index.intersection(y.index) + if len(common_idx) < 10: + continue + r, _ = sp_stats.pearsonr(x.loc[common_idx], y.loc[common_idx]) + cluster_corrs[f"cluster_{cl}"] = round(float(r), 4) + if np.sign(r) != np.sign(overall_r) and abs(r) > 0.1: + n_reversed += 1 + except Exception: + continue + + if n_reversed > 0 and len(cluster_corrs) >= 2: + paradoxes.append({ + "col_a": c1, + "col_b": c2, + "overall_corr": round(float(overall_r), 4), + "cluster_corrs": cluster_corrs, + "n_reversed_clusters": n_reversed, + "is_paradox": 
True, + "paradox_strength": round( + n_reversed / max(len(cluster_corrs), 1), 3 + ), + }) + + if not paradoxes: + return None + + paradoxes.sort(key=lambda x: x["paradox_strength"], reverse=True) + return { + "paradoxes": pd.DataFrame(paradoxes), + "n_paradoxes": len(paradoxes), + } + + # ------------------------------------------------------------------ + # X5. Feature Importance × Missing Rate + # ------------------------------------------------------------------ + + def importance_vs_missing(self) -> pd.DataFrame | None: + """Cross-tabulate feature importance with missing rate.""" + fi = self._stats.feature_importance + mi = self._stats.missing_info + if fi.empty or mi.empty: + return None + + if "missing_ratio" not in mi.columns: + return None + + # Detect the importance column name + imp_col = None + for candidate in ["variance", "cv", "mean_abs_corr", "mutual_info"]: + if candidate in fi.columns: + imp_col = candidate + break + if imp_col is None and len(fi.columns) > 0: + imp_col = fi.columns[0] + if imp_col is None: + return None + + common_cols = list(set(fi.index) & set(mi.index)) + if not common_cols: + return None + + rows = [] + for col in common_cols: + importance = float(fi.loc[col, imp_col]) if col in fi.index else 0 + missing_ratio = float(mi.loc[col, "missing_ratio"]) if col in mi.index else 0 + risk = "none" + if missing_ratio > 0.3 and importance > fi[imp_col].median(): + risk = "high" + elif missing_ratio > 0.1 and importance > fi[imp_col].median(): + risk = "medium" + elif missing_ratio > 0.05: + risk = "low" + rows.append({ + "column": col, + "importance": round(importance, 4), + "missing_ratio": round(missing_ratio, 4), + "information_loss_risk": risk, + }) + + df_result = pd.DataFrame(rows).set_index("column") + df_result = df_result.sort_values("importance", ascending=False) + return df_result + + # ------------------------------------------------------------------ + # X6. 
Unified 2D Embedding (t-SNE/UMAP + Cluster + Anomaly overlay) + # ------------------------------------------------------------------ + + def unified_2d_embedding(self) -> dict[str, Any] | None: + """Prepare a unified 2D scatter dataset with cluster + anomaly labels.""" + adv = self._stats.advanced_stats + dr = adv.get("dimreduction", {}) + clustering = adv.get("clustering", {}) + anomaly_full = adv.get("advanced_anomaly_full", {}) + + # Get 2D coordinates (prefer t-SNE then UMAP) + coords = None + method = None + for key in ["tsne", "umap"]: + emb = dr.get(key) + if emb is not None and isinstance(emb, dict): + c = emb.get("coordinates") + if c is not None and hasattr(c, "shape") and len(c) > 0: + coords = np.asarray(c) + method = key + break + + if coords is None or coords.shape[1] < 2: + return None + + n = coords.shape[0] + result: dict[str, Any] = { + "x": coords[:, 0].tolist(), + "y": coords[:, 1].tolist(), + "method": method, + "n_points": n, + } + + # Add cluster labels + km = clustering.get("kmeans") + if km and km.get("labels") is not None: + cl = np.asarray(km["labels"]) + result["cluster_labels"] = cl[:n].tolist() if len(cl) >= n else cl.tolist() + + # Add anomaly labels + iso = anomaly_full.get("isolation_forest") + if iso and iso.get("labels") is not None: + al = np.asarray(iso["labels"]) + result["anomaly_labels"] = al[:n].tolist() if len(al) >= n else al.tolist() + + return result diff --git a/f2a/stats/descriptive.py b/f2a/stats/descriptive.py new file mode 100644 index 0000000..53cc5f0 --- /dev/null +++ b/f2a/stats/descriptive.py @@ -0,0 +1,125 @@ +"""Descriptive statistics analysis module.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.type_inference import ColumnType + + +class DescriptiveStats: + """Compute descriptive statistics. + + Args: + df: Target DataFrame to analyze. + schema: Data schema. 
+ """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + def summary(self) -> pd.DataFrame: + """Return overall summary statistics. + + Generates a unified summary table covering both numeric and categorical columns. + + Returns: + Summary statistics DataFrame. + """ + rows: list[dict] = [] + for col_info in self._schema.columns: + series = self._df[col_info.name] + row: dict = { + "column": col_info.name, + "type": col_info.inferred_type.value, + "count": int(series.count()), + "missing": col_info.n_missing, + "missing_%": round(col_info.missing_ratio * 100, 2), + "unique": col_info.n_unique, + } + + if col_info.inferred_type == ColumnType.NUMERIC: + row.update(self._numeric_stats(series)) + elif col_info.inferred_type in (ColumnType.CATEGORICAL, ColumnType.BOOLEAN): + row.update(self._categorical_stats(series)) + + rows.append(row) + + return pd.DataFrame(rows).set_index("column") + + def numeric_summary(self) -> pd.DataFrame: + """Return summary statistics for numeric columns only.""" + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + return self._df[cols].describe().T + + def categorical_summary(self) -> pd.DataFrame: + """Return summary statistics for categorical columns only.""" + cols = self._schema.categorical_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col] + top_val = series.mode().iloc[0] if not series.mode().empty else None + rows.append( + { + "column": col, + "count": int(series.count()), + "unique": int(series.nunique()), + "top": top_val, + "freq": int(series.value_counts().iloc[0]) if top_val is not None else 0, + } + ) + return pd.DataFrame(rows).set_index("column") + + # ── Internal helpers ──────────────────────────────── + + @staticmethod + def _numeric_stats(series: pd.Series) -> dict: + """Return numeric column statistics as a dictionary.""" + desc = series.describe() + q1 = 
float(desc.get("25%", np.nan)) + q3 = float(desc.get("75%", np.nan)) + mean = float(series.mean()) + std = float(series.std()) + count = int(series.count()) + skew_val = float(series.skew()) if count >= 3 else np.nan + kurt_val = float(series.kurtosis()) if count >= 4 else np.nan + se = std / np.sqrt(count) if count > 0 else np.nan + cv = abs(std / mean) if mean != 0 else np.nan + mad = float((series - series.median()).abs().median()) + + return { + "mean": round(mean, 4), + "median": round(float(series.median()), 4), + "std": round(std, 4), + "se": round(float(se), 4), + "cv": round(float(cv), 4) if not np.isnan(cv) else None, + "mad": round(mad, 4), + "min": float(series.min()), + "max": float(series.max()), + "range": round(float(series.max() - series.min()), 4), + "p5": round(float(series.quantile(0.05)), 4), + "q1": round(q1, 4), + "q3": round(q3, 4), + "p95": round(float(series.quantile(0.95)), 4), + "iqr": round(q3 - q1, 4), + "skewness": round(skew_val, 4) if not np.isnan(skew_val) else None, + "kurtosis": round(kurt_val, 4) if not np.isnan(kurt_val) else None, + } + + @staticmethod + def _categorical_stats(series: pd.Series) -> dict: + """Return categorical column statistics as a dictionary.""" + vc = series.value_counts() + top_val = vc.index[0] if len(vc) > 0 else None + return { + "top": top_val, + "freq": int(vc.iloc[0]) if len(vc) > 0 else 0, + } diff --git a/f2a/stats/distribution.py b/f2a/stats/distribution.py new file mode 100644 index 0000000..e8ce2f4 --- /dev/null +++ b/f2a/stats/distribution.py @@ -0,0 +1,151 @@ +"""Distribution analysis module.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +from scipy import stats as sp_stats + +from f2a.core.schema import DataSchema + + +class DistributionStats: + """Analyze distribution characteristics of numeric columns. + + Args: + df: Target DataFrame to analyze. + schema: Data schema. 
+ """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + def analyze(self) -> pd.DataFrame: + """Return distribution information for numeric columns. + + Returns: + DataFrame containing skewness, kurtosis, and normality test results. + """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + if len(series) < 3: + continue + rows.append(self._analyze_column(col, series)) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + def quantile_table(self, quantiles: list[float] | None = None) -> pd.DataFrame: + """Return quantile table for numeric columns. + + Args: + quantiles: List of quantiles to compute. Defaults to + ``[0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95]``. + + Returns: + Quantile DataFrame. + """ + if quantiles is None: + quantiles = [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95] + + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + return self._df[cols].quantile(quantiles) + + @staticmethod + def _analyze_column(col: str, series: pd.Series) -> dict: + """Analyze the distribution of a single numeric column.""" + skew = float(series.skew()) + kurt = float(series.kurtosis()) + + n = len(series) + + # ── Normality tests ────────────────────────────── + shapiro_p: float | None = None + dagostino_p: float | None = None + ks_p: float | None = None + anderson_stat: float | None = None + anderson_critical: float | None = None + + # Shapiro-Wilk (best for n <= 5000) + if 3 <= n <= 5000: + try: + _, shapiro_p = sp_stats.shapiro(series) + except Exception: + pass + + # D'Agostino-Pearson (good for n > 20) + if n > 20: + try: + _, dagostino_p = sp_stats.normaltest(series) + except Exception: + pass + + # Kolmogorov-Smirnov + if n >= 5: + try: + mean, std = series.mean(), series.std() + if std > 0: + _, ks_p = sp_stats.kstest(series, "norm", args=(mean, 
std)) + except Exception: + pass + + # Anderson-Darling + if n >= 8: + try: + ad = sp_stats.anderson(series, "norm") + anderson_stat = float(ad.statistic) + # Use the 5% significance level critical value + anderson_critical = float(ad.critical_values[2]) # index 2 = 5% + except Exception: + pass + + # Primary normality verdict (prefer Shapiro for small, D'Agostino for large) + primary_p: float | None = None + primary_test: str = "n/a" + if shapiro_p is not None: + primary_p = shapiro_p + primary_test = "shapiro" + elif dagostino_p is not None: + primary_p = dagostino_p + primary_test = "dagostino" + + # Skewness interpretation + if abs(skew) < 0.5: + skew_label = "symmetric" + elif abs(skew) < 1.0: + skew_label = "moderate skew" + else: + skew_label = "high skew" + + # Kurtosis interpretation (excess kurtosis: 0 = normal) + if abs(kurt) < 0.5: + kurt_label = "mesokurtic" + elif kurt > 0: + kurt_label = "leptokurtic" + else: + kurt_label = "platykurtic" + + return { + "column": col, + "n": n, + "skewness": round(skew, 4), + "skew_type": skew_label, + "kurtosis": round(kurt, 4), + "kurt_type": kurt_label, + "normality_test": primary_test, + "normality_p": round(primary_p, 6) if primary_p is not None else None, + "is_normal_0.05": primary_p > 0.05 if primary_p is not None else None, + "shapiro_p": round(shapiro_p, 6) if shapiro_p is not None else None, + "dagostino_p": round(dagostino_p, 6) if dagostino_p is not None else None, + "ks_p": round(ks_p, 6) if ks_p is not None else None, + "anderson_stat": round(anderson_stat, 4) if anderson_stat is not None else None, + "anderson_5pct_cv": round(anderson_critical, 4) if anderson_critical is not None else None, + } diff --git a/f2a/stats/duplicates.py b/f2a/stats/duplicates.py new file mode 100644 index 0000000..10c86ff --- /dev/null +++ b/f2a/stats/duplicates.py @@ -0,0 +1,61 @@ +"""Duplicate detection module.""" + +from __future__ import annotations + +from typing import Any + +import pandas as pd + +from f2a.core.schema 
class DuplicateStats:
    """Analyse duplicate rows and per-column uniqueness of a DataFrame.

    Args:
        df: Target DataFrame.
        schema: Data schema (stored for interface parity; not read here).
    """

    def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None:
        self._df = df
        self._schema = schema

    def exact_duplicates(self) -> dict[str, Any]:
        """Count rows that are exact duplicates of an earlier row.

        Returns:
            Dictionary with ``total_rows``, ``duplicate_rows``,
            ``unique_rows``, ``duplicate_ratio``.
        """
        total = len(self._df)
        # duplicated() flags every occurrence after the first one.
        dup_count = int(self._df.duplicated().sum())
        ratio = round(dup_count / total, 4) if total > 0 else 0.0
        return {
            "total_rows": total,
            "duplicate_rows": dup_count,
            "unique_rows": total - dup_count,
            "duplicate_ratio": ratio,
        }

    def column_uniqueness(self) -> pd.DataFrame:
        """Per-column uniqueness statistics.

        Returns:
            DataFrame indexed by column name with uniqueness metrics;
            empty when the frame has no columns.
        """
        records: list[dict] = []
        for name in self._df.columns:
            distinct = int(self._df[name].nunique())
            non_null = int(self._df[name].count())
            records.append({
                "column": name,
                "unique_values": distinct,
                "total_non_null": non_null,
                "uniqueness_ratio": round(distinct / non_null, 4) if non_null > 0 else 0.0,
                # Candidate key: every non-null value distinct AND at least one value.
                "is_unique_key": distinct == non_null > 0,
            })
        if not records:
            return pd.DataFrame()
        return pd.DataFrame(records).set_index("column")

    def summary(self) -> dict[str, Any]:
        """Concise duplicate summary (same payload as :meth:`exact_duplicates`)."""
        return self.exact_duplicates()
class FeatureImportanceStats:
    """Rank numeric features by variance, correlation, and mutual information.

    Args:
        df: Target DataFrame.
        schema: Data schema (provides the numeric column list).
    """

    def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None:
        self._df = df
        self._schema = schema

    # ── Variance-based ranking ────────────────────────────

    def variance_ranking(self) -> pd.DataFrame:
        """Rank numeric features by spread (variance, std, CV, range).

        Returns:
            DataFrame indexed by column, sorted by variance descending;
            empty when no numeric column has at least two non-null values.
        """
        numeric = self._schema.numeric_columns
        if not numeric:
            return pd.DataFrame()

        records: list[dict] = []
        for name in numeric:
            values = self._df[name].dropna()
            if len(values) < 2:
                continue  # variance is undefined below two points
            avg = float(values.mean())
            spread = float(values.std())
            # Coefficient of variation is undefined for a zero mean.
            coeff_var = None if avg == 0 else abs(spread / avg)
            records.append({
                "column": name,
                "variance": round(float(values.var()), 4),
                "std": round(spread, 4),
                "cv": None if coeff_var is None else round(coeff_var, 4),
                "range": round(float(values.max() - values.min()), 4),
            })

        if not records:
            return pd.DataFrame()
        ranked = pd.DataFrame(records).sort_values("variance", ascending=False)
        return ranked.set_index("column")

    # ── Correlation-with-all ranking ──────────────────────

    def mean_abs_correlation(self) -> pd.DataFrame:
        """Rank features by mean absolute Pearson correlation with all others.

        Columns with a higher mean |r| are more *connected* to the rest of
        the dataset and may be more informative (or redundant).
        """
        numeric = self._schema.numeric_columns
        if len(numeric) < 2:
            return pd.DataFrame()

        abs_corr = self._df[numeric].corr(method="pearson").abs()
        np.fill_diagonal(abs_corr.values, 0)  # drop self-correlation (always 1)
        per_column = abs_corr.mean()

        ranked = pd.DataFrame({
            "column": per_column.index,
            "mean_abs_corr": per_column.values.round(4),
        })
        return ranked.sort_values("mean_abs_corr", ascending=False).set_index("column")

    # ── Mutual information ────────────────────────────────

    def mutual_information(self) -> pd.DataFrame:
        """Average mutual-information score per numeric feature.

        Requires ``scikit-learn``; returns an empty DataFrame when it is
        not installed or when there is too little clean data.
        """
        numeric = self._schema.numeric_columns
        if len(numeric) < 2:
            return pd.DataFrame()

        try:
            from sklearn.feature_selection import mutual_info_regression
        except ImportError:
            logger.info("scikit-learn not installed; skipping mutual-information analysis.")
            return pd.DataFrame()

        numeric = numeric[:15]  # cap width: MI estimation is expensive
        clean = self._df[numeric].dropna()
        if len(clean) < 30:
            return pd.DataFrame()

        # Regress each column on all others; fold each pairwise MI into
        # both sides' accumulators, then average.
        scores: dict[str, float] = {name: 0.0 for name in numeric}
        pair_count = 0
        for target in numeric:
            features = clean.drop(columns=[target])
            try:
                mi_values = mutual_info_regression(
                    features, clean[target], random_state=42, n_neighbors=5
                )
                for feature_name, value in zip(features.columns, mi_values):
                    scores[feature_name] += float(value)
                    scores[target] += float(value)
                pair_count += len(features.columns)
            except Exception:
                continue

        if pair_count == 0:
            return pd.DataFrame()

        denominator = max(1, len(numeric) - 1)
        for name in scores:
            scores[name] /= denominator

        ranked = pd.DataFrame({
            "column": list(scores.keys()),
            "avg_mutual_info": [round(v, 4) for v in scores.values()],
        })
        return ranked.sort_values("avg_mutual_info", ascending=False).set_index("column")

    # ── Combined summary ──────────────────────────────────

    def summary(self) -> pd.DataFrame:
        """Combined feature-importance summary (variance-based ranking)."""
        return self.variance_ranking()
class FeatureInsightsStats:
    """Feature engineering recommendations and insights.

    Provides interaction detection, monotonic relationship analysis,
    optimal binning, cardinality analysis, and data leakage detection.

    References:
        - Friedman & Popescu (2008) — interaction detection
        - Fayyad & Irani (1993) — entropy-based binning

    Args:
        df: Target DataFrame.
        schema: Data schema (provides the numeric column list).
        max_sample: Max rows to sample for the expensive pairwise scans.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        schema: DataSchema,
        max_sample: int = 5000,
    ) -> None:
        self._df = df
        self._schema = schema
        self._max_sample = max_sample

    # ── Interaction detection ─────────────────────────────

    def interaction_detection(self) -> pd.DataFrame:
        """Detect potential multiplicative feature interactions.

        For each pair of numeric features, computes the correlation of
        their product with each feature individually. A product that
        correlates with a factor well beyond the factors' own correlation
        suggests a meaningful interaction term.

        Returns:
            DataFrame with ``col_a``, ``col_b``, ``interaction_strength``,
            and the supporting correlations, strongest first. Empty when
            fewer than two numeric columns or too few complete rows exist.
        """
        cols = self._schema.numeric_columns
        if len(cols) < 2:
            return pd.DataFrame()

        cols = cols[:15]  # cap the O(k^2) pair scan
        df_clean = self._df[cols].dropna()
        if len(df_clean) < 30:
            return pd.DataFrame()

        if len(df_clean) > self._max_sample:
            df_clean = df_clean.sample(self._max_sample, random_state=42)

        rows: list[dict] = []
        for i in range(len(cols)):
            for j in range(i + 1, len(cols)):
                a = df_clean[cols[i]]
                b = df_clean[cols[j]]

                # Product interaction; constant series make correlation
                # undefined, so skip those pairs outright.
                product = a * b
                if product.std() == 0 or a.std() == 0 or b.std() == 0:
                    continue

                # How much does the product correlate beyond the features?
                r_prod_a = float(product.corr(a))
                r_prod_b = float(product.corr(b))
                r_ab = float(a.corr(b))

                # Interaction strength: residual correlation after
                # removing the linear relationship between a and b.
                interaction_strength = max(abs(r_prod_a), abs(r_prod_b)) - abs(r_ab)

                if abs(interaction_strength) > 0.1:
                    rows.append({
                        "col_a": cols[i],
                        "col_b": cols[j],
                        "interaction_strength": round(interaction_strength, 4),
                        "corr_product_a": round(r_prod_a, 4),
                        "corr_product_b": round(r_prod_b, 4),
                        "corr_a_b": round(r_ab, 4),
                        "recommendation": (
                            "Strong interaction"
                            if interaction_strength > 0.3
                            else "Moderate interaction"
                        ),
                    })

        if not rows:
            return pd.DataFrame()

        return pd.DataFrame(rows).sort_values(
            "interaction_strength", ascending=False
        ).reset_index(drop=True)

    # ── Monotonic relationship detection ──────────────────

    def monotonic_detection(self) -> pd.DataFrame:
        """Detect monotonic relationships using Spearman correlation.

        A high |Spearman| but low |Pearson| suggests a non-linear
        monotonic relationship.

        Returns:
            DataFrame with col_a, col_b, pearson, spearman, monotonic_gap,
            sorted by gap descending; empty when not enough data.
        """
        cols = self._schema.numeric_columns
        if len(cols) < 2:
            return pd.DataFrame()

        cols = cols[:20]
        df_clean = self._df[cols].dropna()
        if len(df_clean) < 20:
            return pd.DataFrame()

        pearson = df_clean.corr(method="pearson")
        spearman = df_clean.corr(method="spearman")

        rows: list[dict] = []
        for i in range(len(cols)):
            for j in range(i + 1, len(cols)):
                r_p = float(pearson.iloc[i, j])
                r_s = float(spearman.iloc[i, j])
                gap = abs(r_s) - abs(r_p)

                # Only report pairs where rank correlation clearly beats
                # linear correlation and the relationship is non-trivial.
                if gap > 0.05 and abs(r_s) > 0.3:
                    rows.append({
                        "col_a": cols[i],
                        "col_b": cols[j],
                        "pearson": round(r_p, 4),
                        "spearman": round(r_s, 4),
                        "monotonic_gap": round(gap, 4),
                        "relationship": (
                            "Strong non-linear monotonic"
                            if gap > 0.15
                            else "Moderate non-linear monotonic"
                        ),
                    })

        if not rows:
            return pd.DataFrame()

        return pd.DataFrame(rows).sort_values(
            "monotonic_gap", ascending=False
        ).reset_index(drop=True)

    # ── Binning analysis ──────────────────────────────────

    def binning_analysis(self, n_bins: int = 10) -> pd.DataFrame:
        """Analyze optimal binning for numeric columns.

        Computes equal-width and equal-frequency binnings and compares
        their entropies (higher entropy = mass spread more evenly across
        bins) to recommend a strategy.

        Args:
            n_bins: Number of bins.

        Returns:
            DataFrame indexed by column with binning statistics; empty
            when no numeric column has at least ``n_bins`` values.
        """
        cols = self._schema.numeric_columns
        if not cols:
            return pd.DataFrame()

        rows: list[dict] = []
        for col in cols:
            series = self._df[col].dropna()
            if len(series) < n_bins:
                continue

            # Equal-width binning (1e-15 guards log2(0) for empty bins).
            try:
                ew_counts = pd.cut(series, bins=n_bins).value_counts(normalize=True)
                ew_entropy = float(-np.sum(ew_counts * np.log2(ew_counts + 1e-15)))
            except Exception:
                ew_entropy = None

            # Equal-frequency binning; duplicate quantile edges collapse.
            try:
                ef_counts = pd.qcut(
                    series, q=n_bins, duplicates="drop"
                ).value_counts(normalize=True)
                ef_entropy = float(-np.sum(ef_counts * np.log2(ef_counts + 1e-15)))
            except Exception:
                ef_entropy = None

            max_entropy = float(np.log2(n_bins))

            recommendation = "equal_frequency"  # robust default for skewed data
            if ew_entropy is not None and ef_entropy is not None:
                if ew_entropy > ef_entropy * 0.95:
                    recommendation = "equal_width"

            rows.append({
                "column": col,
                "n_bins": n_bins,
                # BUG FIX: use `is not None` — a valid entropy of exactly
                # 0.0 (all mass in one bin) was previously reported as None
                # because 0.0 is falsy.
                "equal_width_entropy": round(ew_entropy, 4) if ew_entropy is not None else None,
                "equal_freq_entropy": round(ef_entropy, 4) if ef_entropy is not None else None,
                "max_entropy": round(max_entropy, 4),
                "recommended_method": recommendation,
                "skewness": round(float(series.skew()), 4),
            })

        return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame()

    # ── Cardinality analysis ──────────────────────────────

    def cardinality_analysis(self) -> pd.DataFrame:
        """Analyze cardinality of all columns for encoding recommendations.

        Returns:
            DataFrame indexed by column with cardinality statistics and a
            suggested encoding strategy per column.
        """
        rows: list[dict] = []
        for col in self._df.columns:
            series = self._df[col]
            n_unique = int(series.nunique())
            n_total = int(series.count())
            ratio = n_unique / n_total if n_total > 0 else 0.0

            # Encoding recommendation: ID-like columns first, then by
            # ascending cardinality buckets.
            if ratio > 0.95:
                encoding = "id_column (drop or hash)"
            elif n_unique <= 2:
                encoding = "binary encoding"
            elif n_unique <= 10:
                encoding = "one-hot encoding"
            elif n_unique <= 50:
                encoding = "label encoding or target encoding"
            elif n_unique <= 500:
                encoding = "target encoding or frequency encoding"
            else:
                encoding = "hash encoding or embeddings"

            rows.append({
                "column": col,
                "n_unique": n_unique,
                "n_total": n_total,
                "cardinality_ratio": round(ratio, 4),
                "cardinality_level": (
                    "binary" if n_unique <= 2
                    else "low" if n_unique <= 10
                    else "medium" if n_unique <= 50
                    else "high" if n_unique <= 500
                    else "very_high"
                ),
                "recommended_encoding": encoding,
            })

        return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame()

    # ── Leakage detection ─────────────────────────────────

    def leakage_detection(self) -> pd.DataFrame:
        """Detect potential data leakage indicators.

        Flags columns with:
        - Perfect or near-perfect correlation with other columns
        - Suspiciously high unique ratio (possible target leak)
        - Constant or near-constant values

        Returns:
            DataFrame (indexed by column) listing only flagged columns
            with their risk level and the specific risks found.
        """
        cols = self._schema.numeric_columns
        all_cols = list(self._df.columns)

        rows: list[dict] = []

        for col in all_cols:
            series = self._df[col]
            n_total = int(series.count())
            n_unique = int(series.nunique())
            ratio = n_unique / n_total if n_total > 0 else 0

            risks: list[str] = []

            # Constant / near-constant columns carry no signal.
            if n_unique <= 1:
                risks.append("constant_column")
            elif n_unique == 2 and n_total > 100:
                top_freq = series.value_counts().iloc[0] / n_total
                if top_freq > 0.99:
                    risks.append("near_constant")

            # ID-like: almost every value distinct in a non-trivial sample.
            if ratio > 0.95 and n_total > 100:
                risks.append("id_like")

            # Perfect correlation with another numeric column suggests a
            # duplicate or a leaked transformation of it.
            if col in cols:
                for other in cols:
                    if other == col:
                        continue
                    try:
                        r = abs(float(self._df[col].corr(self._df[other])))
                        if r > 0.99:
                            risks.append(f"perfect_corr_with_{other}")
                            break
                    except Exception:
                        continue

            risk_level = (
                "high" if len(risks) >= 2
                else "medium" if len(risks) == 1
                else "low"
            )

            if risks:
                rows.append({
                    "column": col,
                    "risk_level": risk_level,
                    "risks": "; ".join(risks),
                    "unique_ratio": round(ratio, 4),
                    "n_unique": n_unique,
                })

        return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame()

    # ── Summary ───────────────────────────────────────────

    def summary(self) -> dict[str, Any]:
        """Return combined feature insight results.

        Each analysis is best-effort: failures are logged at debug level
        and the corresponding key is simply omitted.
        """
        result: dict[str, Any] = {}

        try:
            inter = self.interaction_detection()
            if not inter.empty:
                result["interactions"] = inter
        except Exception as exc:
            logger.debug("Interaction detection skipped: %s", exc)

        try:
            mono = self.monotonic_detection()
            if not mono.empty:
                result["monotonic"] = mono
        except Exception as exc:
            logger.debug("Monotonic detection skipped: %s", exc)

        try:
            bins = self.binning_analysis()
            if not bins.empty:
                result["binning"] = bins
        except Exception as exc:
            logger.debug("Binning analysis skipped: %s", exc)

        try:
            card = self.cardinality_analysis()
            if not card.empty:
                result["cardinality"] = card
        except Exception as exc:
            logger.debug("Cardinality analysis skipped: %s", exc)

        try:
            leak = self.leakage_detection()
            if not leak.empty:
                result["leakage"] = leak
        except Exception as exc:
            logger.debug("Leakage detection skipped: %s", exc)

        return result
class InsightType(str, Enum):
    """Kind of message an insight carries."""

    FINDING = "finding"
    WARNING = "warning"
    RECOMMENDATION = "recommendation"
    OPPORTUNITY = "opportunity"


class Severity(str, Enum):
    """How urgently an insight should be acted upon."""

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


# Severity → numeric weight fed into the priority-score formula.
_SEV_WEIGHT = {
    Severity.CRITICAL: 1.0,
    Severity.HIGH: 0.75,
    Severity.MEDIUM: 0.5,
    Severity.LOW: 0.25,
}


@dataclass
class Insight:
    """A single auto-generated insight with ranking metadata."""

    type: InsightType
    severity: Severity
    category: str  # distribution | correlation | cluster | anomaly | missing | quality | feature | general
    title: str
    description: str
    affected_columns: list[str] = field(default_factory=list)
    evidence: dict[str, Any] = field(default_factory=dict)
    action_items: list[str] = field(default_factory=list)
    priority_score: float = 0.0

    def __post_init__(self) -> None:
        # Derive a priority score unless the caller supplied one explicitly
        # (a non-zero value passed in is kept as-is).
        if self.priority_score == 0.0:
            if self.affected_columns:
                breadth = min(len(self.affected_columns) / 5.0, 1.0)
            else:
                breadth = 0.3
            actionability = 1.0 if self.action_items else 0.6
            weight = _SEV_WEIGHT.get(self.severity, 0.5)
            self.priority_score = round(
                weight * 0.5 + breadth * 0.3 + actionability * 0.2,
                4,
            )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to JSON-safe primitives (evidence values converted)."""
        return {
            "type": self.type.value,
            "severity": self.severity.value,
            "category": self.category,
            "title": self.title,
            "description": self.description,
            "affected_columns": self.affected_columns,
            "evidence": {k: _safe_serialize(v) for k, v in self.evidence.items()},
            "action_items": self.action_items,
            "priority_score": self.priority_score,
        }


def _safe_serialize(v: Any) -> Any:
    """Convert numpy / pandas types to JSON-safe Python primitives."""
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, np.floating):
        return float(v)
    if isinstance(v, np.bool_):
        return bool(v)
    if isinstance(v, np.ndarray):
        return v.tolist()
    if isinstance(v, (pd.DataFrame, pd.Series)):
        return v.to_dict()
    return v
rules failed: %s", exc) + + try: + self._clustering_insights() + except Exception as exc: + logger.debug("Clustering insight rules failed: %s", exc) + + try: + self._anomaly_insights() + except Exception as exc: + logger.debug("Anomaly insight rules failed: %s", exc) + + try: + self._feature_insights() + except Exception as exc: + logger.debug("Feature insight rules failed: %s", exc) + + try: + self._pca_insights() + except Exception as exc: + logger.debug("PCA insight rules failed: %s", exc) + + try: + self._duplicate_insights() + except Exception as exc: + logger.debug("Duplicate insight rules failed: %s", exc) + + try: + self._advanced_distribution_insights() + except Exception as exc: + logger.debug("Adv distribution insight rules failed: %s", exc) + + try: + self._advanced_correlation_insights() + except Exception as exc: + logger.debug("Adv correlation insight rules failed: %s", exc) + + try: + self._general_insights() + except Exception as exc: + logger.debug("General insight rules failed: %s", exc) + + self._insights.sort(key=lambda i: i.priority_score, reverse=True) + return self._insights + + def executive_summary(self) -> str: + """One-paragraph natural-language summary of the dataset.""" + if not self._insights: + self.generate() + + n = self._schema.n_rows + d = self._schema.n_cols + num = len(self._schema.numeric_columns) + cat = len(self._schema.categorical_columns) + + crit = sum(1 for i in self._insights if i.severity == Severity.CRITICAL) + high = sum(1 for i in self._insights if i.severity == Severity.HIGH) + med = sum(1 for i in self._insights if i.severity == Severity.MEDIUM) + + parts = [ + f"Dataset contains {n:,} rows and {d} columns ({num} numeric, {cat} categorical).", + ] + if crit: + parts.append(f"{crit} critical issue(s) require immediate attention.") + if high: + parts.append(f"{high} high-priority finding(s) detected.") + if med: + parts.append(f"{med} moderate observations noted.") + + # Top 3 headlines + top3 = self._insights[:3] + 
if top3: + parts.append("Key highlights:") + for idx, ins in enumerate(top3, 1): + parts.append(f" {idx}. {ins.title}") + + return " ".join(parts) + + def summary_dict(self) -> dict[str, Any]: + """Serialize all insights for storage / HTML rendering.""" + if not self._insights: + self.generate() + return { + "executive_summary": self.executive_summary(), + "total_count": len(self._insights), + "by_severity": { + s.value: sum(1 for i in self._insights if i.severity == s) + for s in Severity + }, + "by_type": { + t.value: sum(1 for i in self._insights if i.type == t) + for t in InsightType + }, + "insights": [i.to_dict() for i in self._insights], + } + + # ------------------------------------------------------------------ + # Helper + # ------------------------------------------------------------------ + + def _add(self, **kwargs: Any) -> None: + self._insights.append(Insight(**kwargs)) + + # ================================================================== + # Rule Sets + # ================================================================== + + # -- 1. Distribution -------------------------------------------------- + + def _distribution_insights(self) -> None: + summary = self._stats.summary + dist = self._stats.distribution_info + if summary.empty: + return + + numeric_rows = summary[summary.get("type", pd.Series(dtype=str)) == "numeric"] if "type" in summary.columns else summary + if numeric_rows.empty: + return + + # Extreme skewness + if "skewness" in numeric_rows.columns: + skewed = numeric_rows[numeric_rows["skewness"].abs() > 2.0].dropna(subset=["skewness"]) + if not skewed.empty: + cols = list(skewed.index) + worst = skewed["skewness"].abs().idxmax() + worst_val = skewed.loc[worst, "skewness"] + self._add( + type=InsightType.RECOMMENDATION, + severity=Severity.HIGH, + category="distribution", + title=f"{len(cols)} column(s) with extreme skewness", + description=( + f"Columns {cols[:5]} have |skewness| > 2, indicating " + f"heavily asymmetric distributions. 
" + f"Worst: '{worst}' (skewness={worst_val:.2f})." + ), + affected_columns=cols, + evidence={"worst_column": worst, "worst_skewness": float(worst_val)}, + action_items=[ + "Apply log or Box-Cox transform to reduce skewness", + "Consider robust statistics (median, MAD) instead of mean/std", + ], + ) + + # High kurtosis (heavy tails) + if "kurtosis" in numeric_rows.columns: + heavy = numeric_rows[numeric_rows["kurtosis"] > 7.0].dropna(subset=["kurtosis"]) + if not heavy.empty: + cols = list(heavy.index) + self._add( + type=InsightType.WARNING, + severity=Severity.MEDIUM, + category="distribution", + title=f"{len(cols)} column(s) with extreme kurtosis (heavy tails)", + description=( + f"Columns {cols[:5]} have kurtosis > 7, meaning very heavy tails. " + "Outliers may dominate summary statistics." + ), + affected_columns=cols, + evidence={"kurtosis_values": {c: float(numeric_rows.loc[c, "kurtosis"]) for c in cols[:5]}}, + action_items=[ + "Use winsorization or robust estimators", + "Check these columns for extreme outliers", + ], + ) + + # Normality summary + if not dist.empty and "is_normal_0.05" in dist.columns: + normal = dist[dist["is_normal_0.05"] == True] + non_normal = dist[dist["is_normal_0.05"] == False] + total = len(dist) + if len(non_normal) > total * 0.8 and total >= 3: + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="distribution", + title=f"{len(non_normal)}/{total} numeric columns are non-normal", + description=( + "Most numeric columns fail normality tests (α=0.05). " + "Non-parametric methods may be more appropriate." 
+ ), + affected_columns=list(non_normal.index), + action_items=[ + "Prefer non-parametric tests (Kruskal-Wallis, Mann-Whitney) over t-tests/ANOVA", + "Consider power transforms if normality is needed for downstream models", + ], + ) + + # Low variability + if "cv" in numeric_rows.columns: + low_var = numeric_rows[(numeric_rows["cv"].notna()) & (numeric_rows["cv"].abs() < 0.05)] + if not low_var.empty: + cols = list(low_var.index) + self._add( + type=InsightType.FINDING, + severity=Severity.LOW, + category="distribution", + title=f"{len(cols)} column(s) with very low variability (CV < 5%)", + description=( + f"Columns {cols[:5]} have coefficient of variation < 5%, " + "meaning values are tightly clustered. These may be near-constant." + ), + affected_columns=cols, + action_items=["Evaluate whether these columns carry useful information"], + ) + + # -- 2. Correlation --------------------------------------------------- + + def _correlation_insights(self) -> None: + corr = self._stats.correlation_matrix + vif = self._stats.vif_table + spearman = self._stats.spearman_matrix + + # Multicollinearity via VIF + if not vif.empty and "VIF" in vif.columns: + severe = vif[vif["VIF"] > 10] + if not severe.empty: + cols = list(severe.index) + worst = severe["VIF"].idxmax() + self._add( + type=InsightType.WARNING, + severity=Severity.CRITICAL if len(severe) > 3 else Severity.HIGH, + category="correlation", + title=f"{len(severe)} column(s) with severe multicollinearity (VIF>10)", + description=( + f"VIF > 10 detected for: {cols[:5]}. " + f"Worst: '{worst}' (VIF={severe.loc[worst, 'VIF']:.1f}). " + "Redundant information may cause model instability." 
+ ), + affected_columns=cols, + evidence={"vif_values": {c: float(severe.loc[c, "VIF"]) for c in cols[:5]}}, + action_items=[ + "Remove one column from each highly correlated pair", + "Apply PCA or regularization (Ridge/Lasso) to handle collinearity", + ], + ) + + # High pearson correlation pairs + if not corr.empty: + pairs: list[tuple[str, str, float]] = [] + cols_list = corr.columns.tolist() + for i, c1 in enumerate(cols_list): + for c2 in cols_list[i + 1:]: + v = corr.loc[c1, c2] + if abs(v) > 0.9: + pairs.append((c1, c2, float(v))) + if pairs: + affected = list({c for p in pairs for c in p[:2]}) + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="correlation", + title=f"{len(pairs)} column pair(s) with |r| > 0.9", + description=( + "Near-perfect linear relationships detected. " + f"Top pair: '{pairs[0][0]}' ↔ '{pairs[0][1]}' (r={pairs[0][2]:.3f})." + ), + affected_columns=affected, + evidence={"pairs": [(p[0], p[1], p[2]) for p in pairs[:5]]}, + action_items=[ + "Consider dropping one column from each pair to reduce redundancy", + "Verify these are not data leakage or duplicate columns", + ], + ) + + # No correlations at all (independent features) + if not corr.empty and corr.shape[0] >= 3: + upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1)) + max_abs = upper.abs().max().max() + if max_abs < 0.3: + self._add( + type=InsightType.FINDING, + severity=Severity.LOW, + category="correlation", + title="All numeric columns are weakly correlated (max |r| < 0.3)", + description=( + "No strong linear relationships found between any pair of numeric columns. " + "Features appear largely independent." + ), + affected_columns=list(corr.columns), + action_items=["Check for non-linear relationships via MI or distance correlation"], + ) + + # -- 3. 
Missing ------------------------------------------------------- + + def _missing_insights(self) -> None: + mi = self._stats.missing_info + if mi.empty or "missing_ratio" not in mi.columns: + return + + total_ratio = mi["missing_ratio"].mean() if not mi.empty else 0 + high_miss = mi[mi["missing_ratio"] > 0.5] + moderate_miss = mi[(mi["missing_ratio"] > 0.1) & (mi["missing_ratio"] <= 0.5)] + no_miss = mi[mi["missing_ratio"] == 0] + + # Columns with >50% missing + if not high_miss.empty: + cols = list(high_miss.index) + self._add( + type=InsightType.WARNING, + severity=Severity.CRITICAL, + category="missing", + title=f"{len(cols)} column(s) with >50% missing values", + description=( + f"Columns {cols[:5]} are more than half empty. " + "These columns may not be usable without strong imputation." + ), + affected_columns=cols, + evidence={"missing_ratios": {c: float(high_miss.loc[c, "missing_ratio"]) for c in cols[:5]}}, + action_items=[ + "Consider dropping these columns unless domain-critical", + "If kept, use model-based imputation (KNN, MICE) rather than simple mean/median", + ], + ) + + # Moderate missing + if not moderate_miss.empty: + cols = list(moderate_miss.index) + self._add( + type=InsightType.RECOMMENDATION, + severity=Severity.MEDIUM, + category="missing", + title=f"{len(cols)} column(s) with 10-50% missing values", + description=( + f"Columns {cols[:5]} have noticeable missing rates. " + "Imputation strategy should be chosen carefully." 
+ ), + affected_columns=cols, + action_items=[ + "Check if missingness is random (MCAR) or systematic (MAR/MNAR)", + "For numeric columns: median or KNN imputation; for categorical: mode or indicator variable", + ], + ) + + # Completely clean + if len(no_miss) == len(mi) and len(mi) > 0: + self._add( + type=InsightType.FINDING, + severity=Severity.LOW, + category="missing", + title="No missing values detected in any column", + description="All columns are fully populated — no imputation needed.", + affected_columns=[], + ) + + # -- 4. Outlier ------------------------------------------------------- + + def _outlier_insights(self) -> None: + out = self._stats.outlier_summary + if out.empty or "outlier_%" not in out.columns: + return + + extreme = out[out["outlier_%"] > 15] + moderate = out[(out["outlier_%"] > 5) & (out["outlier_%"] <= 15)] + + if not extreme.empty: + cols = list(extreme.index) + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="anomaly", + title=f"{len(cols)} column(s) with extreme outlier rate (>15%)", + description=( + f"Columns {cols[:5]} have very high outlier percentages. " + "This may indicate data quality issues or heavy-tailed distributions." + ), + affected_columns=cols, + evidence={"outlier_rates": {c: float(extreme.loc[c, "outlier_%"]) for c in cols[:5]}}, + action_items=[ + "Check if the distribution is truly heavy-tailed (in which case outliers are expected)", + "Apply winsorization or log-transform if outliers are distorting analysis", + "Consider using robust methods (median, MAD, IQR-based)", + ], + ) + + if not moderate.empty: + cols = list(moderate.index) + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="anomaly", + title=f"{len(cols)} column(s) with notable outlier rate (5-15%)", + description=f"Columns {cols[:5]} have moderate outlier rates.", + affected_columns=cols, + action_items=["Review outlier boundaries and adjust if domain knowledge warrants"], + ) + + # -- 5. 
Quality ------------------------------------------------------- + + def _quality_insights(self) -> None: + qs = self._stats.quality_scores + if not qs: + return + + overall = qs.get("overall", 1.0) + if overall < 0.5: + self._add( + type=InsightType.WARNING, + severity=Severity.CRITICAL, + category="quality", + title=f"Overall data quality is poor ({overall * 100:.0f}%)", + description=( + "The combined quality score across completeness, uniqueness, " + "consistency, and validity is below 50%." + ), + evidence=qs, + action_items=[ + "Address missing values and inconsistencies before analysis", + "Review data collection pipeline for systematic issues", + ], + ) + elif overall < 0.75: + self._add( + type=InsightType.RECOMMENDATION, + severity=Severity.MEDIUM, + category="quality", + title=f"Data quality is moderate ({overall * 100:.0f}%)", + description="Some quality dimensions need attention before production use.", + evidence=qs, + action_items=["Focus on the lowest-scoring quality dimension"], + ) + + # Per-dimension alerts + for dim, label in [("completeness", "Completeness"), ("uniqueness", "Uniqueness"), + ("consistency", "Consistency"), ("validity", "Validity")]: + score = qs.get(dim, 1.0) + if score < 0.6: + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="quality", + title=f"{label} score is low ({score * 100:.0f}%)", + description=f"The {label.lower()} dimension scored {score * 100:.0f}%, dragging down overall quality.", + evidence={dim: score}, + action_items=[f"Investigate and improve {label.lower()} issues"], + ) + + # -- 6. 
Clustering ---------------------------------------------------- + + def _clustering_insights(self) -> None: + adv = self._stats.advanced_stats + clustering = adv.get("clustering") + if not clustering: + return + + km = clustering.get("kmeans") + if km: + k = km.get("optimal_k", 0) + sil = km.get("best_silhouette", 0) + sizes = km.get("cluster_sizes", {}) + + if k >= 2 and sil > 0.4: + self._add( + type=InsightType.OPPORTUNITY, + severity=Severity.HIGH, + category="cluster", + title=f"Clear cluster structure found (k={k}, silhouette={sil:.2f})", + description=( + f"K-Means identifies {k} well-separated clusters " + f"(silhouette={sil:.2f}). Cluster sizes: {sizes}." + ), + evidence={"optimal_k": k, "silhouette": sil, "sizes": sizes}, + action_items=[ + "Profile each cluster to understand segment characteristics", + "Use cluster labels as a feature for downstream modelling", + ], + ) + elif k >= 2 and sil > 0.2: + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="cluster", + title=f"Moderate cluster structure (k={k}, silhouette={sil:.2f})", + description=( + f"Some grouping exists but clusters overlap. " + f"Silhouette={sil:.2f} suggests partial separation." + ), + evidence={"optimal_k": k, "silhouette": sil}, + action_items=["Consider density-based methods (DBSCAN) for better cluster boundaries"], + ) + + # Check for imbalanced clusters + if sizes: + total = sum(sizes.values()) + if total > 0: + min_pct = min(sizes.values()) / total + max_pct = max(sizes.values()) / total + if min_pct < 0.05: + tiny_clusters = [k for k, v in sizes.items() if v / total < 0.05] + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="cluster", + title=f"Highly imbalanced clusters detected", + description=( + f"Cluster(s) {tiny_clusters} contain <5% of data. " + "These may represent anomalous sub-populations." 
+ ), + evidence={"tiny_clusters": tiny_clusters, "min_pct": min_pct}, + action_items=["Inspect small clusters — they may be anomalies or niche segments"], + ) + + dbscan = clustering.get("dbscan") + if dbscan: + noise_ratio = dbscan.get("noise_ratio", 0) + if noise_ratio > 0.2: + self._add( + type=InsightType.WARNING, + severity=Severity.MEDIUM, + category="cluster", + title=f"DBSCAN labels {noise_ratio * 100:.0f}% of data as noise", + description=( + "A high proportion of data points don't belong to any density cluster. " + "This may indicate dispersed data or sub-optimal epsilon." + ), + evidence={"noise_ratio": noise_ratio, "eps": dbscan.get("eps")}, + action_items=["Try adjusting eps parameter or use HDBSCAN for adaptive density"], + ) + + # -- 7. Anomaly ------------------------------------------------------- + + def _anomaly_insights(self) -> None: + adv = self._stats.advanced_stats + anomaly = adv.get("advanced_anomaly", {}) + consensus = anomaly.get("consensus") + if not consensus: + return + + ratio = consensus.get("consensus_ratio", 0) + count = consensus.get("consensus_count", 0) + n = consensus.get("n_samples", 1) + agreement = consensus.get("agreement_matrix", {}) + + if ratio > 0.05: + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="anomaly", + title=f"Multi-method consensus: {count} anomalies ({ratio * 100:.1f}%)", + description=( + f"{count} rows flagged as anomalous by ≥2 independent methods " + f"(IF + LOF + Mahalanobis). " + f"All-agree: {agreement.get('all_agree_anomaly', 0)}, " + f"majority: {agreement.get('majority_anomaly', 0)}." 
+ ), + evidence={"consensus_ratio": ratio, "agreement": agreement}, + action_items=[ + "Investigate consensus anomalies — they are high-confidence outliers", + "Consider removing or winsorizing before modelling", + ], + ) + elif ratio > 0.01: + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="anomaly", + title=f"Multi-method anomalies: {count} rows ({ratio * 100:.1f}%)", + description=( + f"A small fraction of rows are flagged by multiple anomaly detection methods." + ), + evidence={"consensus_ratio": ratio}, + action_items=["Review flagged rows for data entry errors or special cases"], + ) + + # -- 8. Feature Insights ---------------------------------------------- + + def _feature_insights(self) -> None: + adv = self._stats.advanced_stats + fi = adv.get("feature_insights", {}) + if not fi: + return + + # Leakage detection + leakage = fi.get("leakage") + if leakage is not None and not leakage.empty: + high_risk = leakage[leakage.get("risk_level", pd.Series()) == "high"] if "risk_level" in leakage.columns else pd.DataFrame() + if not high_risk.empty: + cols = list(high_risk.index) + self._add( + type=InsightType.WARNING, + severity=Severity.CRITICAL, + category="feature", + title=f"{len(cols)} column(s) flagged for potential data leakage", + description=( + f"Columns {cols[:5]} show high leakage risk " + "(constant, ID-like, or perfectly correlated with others)." 
+ ), + affected_columns=cols, + action_items=[ + "Remove these columns before building any ML model", + "Verify they are not derived from the target variable", + ], + ) + + # Strong interactions + interactions = fi.get("interactions") + if interactions is not None and not interactions.empty: + strong = interactions[interactions.get("interaction_strength", pd.Series(dtype=float)) > 0.7] if "interaction_strength" in interactions.columns else pd.DataFrame() + if not strong.empty and len(strong) > 0: + top = strong.iloc[0] + self._add( + type=InsightType.OPPORTUNITY, + severity=Severity.MEDIUM, + category="feature", + title=f"{len(strong)} strong feature interaction(s) detected", + description=( + f"Top interaction: '{top.get('col_a', '?')}' × '{top.get('col_b', '?')}' " + f"(strength={top.get('interaction_strength', 0):.2f}). " + "Product features may improve model performance." + ), + affected_columns=[str(top.get("col_a", "")), str(top.get("col_b", ""))], + action_items=["Create interaction (product) features for the top pairs"], + ) + + # Cardinality / encoding + card = fi.get("cardinality") + if card is not None and not card.empty and "recommended_encoding" in card.columns: + hash_cols = card[card["recommended_encoding"] == "hashing"] + if not hash_cols.empty: + cols = list(hash_cols.index) + self._add( + type=InsightType.RECOMMENDATION, + severity=Severity.MEDIUM, + category="feature", + title=f"{len(cols)} high-cardinality column(s) need special encoding", + description=( + f"Columns {cols[:5]} have very high cardinality. " + "One-hot encoding would create too many features." + ), + affected_columns=cols, + action_items=[ + "Use target encoding, hashing, or embedding for these columns", + "Consider grouping rare categories into 'Other'", + ], + ) + + # -- 9. 
PCA ----------------------------------------------------------- + + def _pca_insights(self) -> None: + pca_sum = self._stats.pca_summary + pca_var = self._stats.pca_variance + if not pca_sum: + return + + comp90 = pca_sum.get("components_for_90pct", 0) + n_orig = len(self._schema.numeric_columns) + if n_orig > 0 and comp90 > 0: + reduction = 1 - comp90 / n_orig + if reduction > 0.5: + self._add( + type=InsightType.OPPORTUNITY, + severity=Severity.MEDIUM, + category="feature", + title=f"High dimensionality reduction potential: {n_orig} → {comp90} components for 90% variance", + description=( + f"PCA shows that {comp90} components explain 90% of variance " + f"from {n_orig} original features ({reduction * 100:.0f}% reduction)." + ), + evidence={"original_features": n_orig, "pca_components": comp90, "reduction": reduction}, + action_items=[ + "Consider PCA projection for dimensionality reduction in ML pipelines", + "Examine top PCA loadings to understand dominant variance directions", + ], + ) + + # First PC dominance + if not pca_var.empty and "variance_ratio" in pca_var.columns: + first_pc = pca_var.iloc[0]["variance_ratio"] if len(pca_var) > 0 else 0 + if first_pc > 0.6: + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="feature", + title=f"First principal component explains {first_pc * 100:.0f}% of variance", + description=( + "A single axis captures most of the data's variability. " + "This suggests a dominant latent factor." + ), + evidence={"pc1_variance_ratio": first_pc}, + action_items=["Inspect PC1 loadings to identify the driving variables"], + ) + + # -- 10. 
Duplicates --------------------------------------------------- + + def _duplicate_insights(self) -> None: + dup = self._stats.duplicate_stats + if not dup: + return + + ratio = dup.get("duplicate_ratio", 0) + count = dup.get("duplicate_rows", 0) + + if ratio > 0.1: + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="quality", + title=f"{count} duplicate rows ({ratio * 100:.1f}% of dataset)", + description="Significant portion of data is duplicated, which may bias analysis and modelling.", + evidence={"duplicate_rows": count, "duplicate_ratio": ratio}, + action_items=[ + "Remove exact duplicates before analysis", + "Check if duplicates are legitimate (e.g. repeated measurements) or data errors", + ], + ) + elif ratio > 0.01: + self._add( + type=InsightType.FINDING, + severity=Severity.LOW, + category="quality", + title=f"{count} duplicate rows ({ratio * 100:.1f}%)", + description="A small number of duplicate rows exist.", + evidence={"duplicate_rows": count, "duplicate_ratio": ratio}, + action_items=["Review whether duplicates should be removed for your use case"], + ) + + # -- 11. 
Advanced Distribution ---------------------------------------- + + def _advanced_distribution_insights(self) -> None: + adv = self._stats.advanced_stats + adv_dist = adv.get("advanced_distribution", {}) + if not adv_dist: + return + + # Best-fit distribution + best_fit = adv_dist.get("best_fit") + if best_fit is not None and not best_fit.empty and "best_distribution" in best_fit.columns: + non_normal = best_fit[best_fit["best_distribution"] != "norm"] + if not non_normal.empty: + dist_counts: dict[str, int] = {} + for d in non_normal["best_distribution"]: + dist_counts[d] = dist_counts.get(d, 0) + 1 + most_common = max(dist_counts, key=dist_counts.get) + self._add( + type=InsightType.FINDING, + severity=Severity.MEDIUM, + category="distribution", + title=f"{len(non_normal)} column(s) best fit by non-normal distributions", + description=( + f"Distribution fitting reveals non-Normal best fits. " + f"Most common: {most_common} ({dist_counts[most_common]} columns). " + f"Others: {dict(list(dist_counts.items())[:5])}." + ), + affected_columns=list(non_normal.index), + evidence={"distribution_counts": dist_counts}, + action_items=[ + "Use the identified distributions for parametric modeling or simulation", + "Transform columns toward normality if Gaussian assumptions are needed", + ], + ) + + # Power transform recommendations + pt = adv_dist.get("power_transform") + if pt is not None and not pt.empty and "needs_transform" in pt.columns: + needs = pt[pt["needs_transform"] == True] + if not needs.empty: + cols = list(needs.index) + self._add( + type=InsightType.RECOMMENDATION, + severity=Severity.MEDIUM, + category="distribution", + title=f"{len(cols)} column(s) benefit from power transformation", + description=( + f"Box-Cox / Yeo-Johnson transforms can significantly reduce skewness " + f"for columns: {cols[:5]}." + ), + affected_columns=cols, + action_items=[ + "Apply the recommended transform (Box-Cox or Yeo-Johnson) in preprocessing", + ], + ) + + # -- 12. 
Advanced Correlation ----------------------------------------- + + def _advanced_correlation_insights(self) -> None: + adv = self._stats.advanced_stats + adv_corr = adv.get("advanced_correlation", {}) + if not adv_corr: + return + + # Non-linear dependencies via MI + mi = adv_corr.get("mutual_information") + pearson = self._stats.correlation_matrix + if mi is not None and not mi.empty and not pearson.empty: + # Find pairs with high MI but low Pearson (non-linear relationship) + mi_cols = set(mi.columns) & set(pearson.columns) + nonlinear_pairs = [] + for c1 in mi_cols: + for c2 in mi_cols: + if c1 >= c2: + continue + mi_val = mi.loc[c1, c2] if c1 in mi.index and c2 in mi.columns else 0 + p_val = abs(pearson.loc[c1, c2]) if c1 in pearson.index and c2 in pearson.columns else 0 + if mi_val > 0.3 and p_val < 0.3: + nonlinear_pairs.append((c1, c2, float(mi_val), float(p_val))) + + if nonlinear_pairs: + nonlinear_pairs.sort(key=lambda x: x[2], reverse=True) + top = nonlinear_pairs[0] + self._add( + type=InsightType.FINDING, + severity=Severity.HIGH, + category="correlation", + title=f"{len(nonlinear_pairs)} non-linear dependency pair(s) detected", + description=( + f"High mutual information but low Pearson correlation suggests non-linear " + f"relationships. Top: '{top[0]}' ↔ '{top[1]}' (MI={top[2]:.2f}, r={top[3]:.2f})." 
+ ), + affected_columns=[top[0], top[1]], + evidence={"nonlinear_pairs": nonlinear_pairs[:5]}, + action_items=[ + "Use non-linear models (tree-based, kernel) to capture these relationships", + "Consider polynomial or interaction features", + ], + ) + + # Confounded correlations (partial vs raw) + pcorr = adv_corr.get("partial_correlation") + if pcorr is not None and not pcorr.empty and not pearson.empty: + confounded = [] + pcorr_cols = set(pcorr.columns) & set(pearson.columns) + for c1 in pcorr_cols: + for c2 in pcorr_cols: + if c1 >= c2: + continue + raw = pearson.loc[c1, c2] if c1 in pearson.index and c2 in pearson.columns else 0 + part = pcorr.loc[c1, c2] if c1 in pcorr.index and c2 in pcorr.columns else 0 + if abs(raw) > 0.5 and abs(raw - part) > 0.3: + confounded.append((c1, c2, float(raw), float(part))) + + if confounded: + confounded.sort(key=lambda x: abs(x[2] - x[3]), reverse=True) + top = confounded[0] + self._add( + type=InsightType.FINDING, + severity=Severity.HIGH, + category="correlation", + title=f"{len(confounded)} likely confounded correlation(s) detected", + description=( + f"Raw correlation differs significantly from partial correlation, " + f"suggesting confounding variables. " + f"Top: '{top[0]}' ↔ '{top[1]}' (raw r={top[2]:.2f}, partial r={top[3]:.2f})." 
+ ), + affected_columns=[top[0], top[1]], + evidence={"confounded_pairs": confounded[:5]}, + action_items=[ + "Do not assume causal relationship from raw correlation for these pairs", + "Investigate which variables are confounders", + ], + ) + + # Bootstrap CI stability + bci = adv_corr.get("bootstrap_ci") + if bci is not None and not bci.empty and "ci_width" in bci.columns: + unstable = bci[bci["ci_width"] > 0.4] + if not unstable.empty: + self._add( + type=InsightType.WARNING, + severity=Severity.MEDIUM, + category="correlation", + title=f"{len(unstable)} correlation estimate(s) with wide bootstrap CI", + description=( + "Correlation confidence intervals wider than 0.4 indicate " + "unreliable estimates — possibly due to small sample or outliers." + ), + evidence={"unstable_count": len(unstable)}, + action_items=[ + "Treat these correlations with caution", + "Consider collecting more data or removing outliers", + ], + ) + + # -- 13. General / Cross-Cutting -------------------------------------- + + def _general_insights(self) -> None: + n_rows = self._schema.n_rows + n_cols = self._schema.n_cols + n_num = len(self._schema.numeric_columns) + n_cat = len(self._schema.categorical_columns) + + # Curse of dimensionality + if n_cols > 0 and n_rows / n_cols < 10: + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="general", + title=f"Low sample-to-feature ratio ({n_rows / n_cols:.1f}:1)", + description=( + f"With {n_rows} rows and {n_cols} columns, the sample-to-feature ratio is low. " + "This raises overfitting risk in ML models." 
+ ), + evidence={"n_rows": n_rows, "n_cols": n_cols, "ratio": n_rows / n_cols}, + action_items=[ + "Apply dimensionality reduction (PCA, feature selection) before modelling", + "Use regularization (L1/L2) or simpler models", + "Collect more data if possible", + ], + ) + + # Very small dataset + if n_rows < 50: + self._add( + type=InsightType.WARNING, + severity=Severity.HIGH, + category="general", + title=f"Very small dataset ({n_rows} rows)", + description=( + "Statistical tests and ML models may be unreliable with so few samples. " + "Confidence intervals will be wide." + ), + evidence={"n_rows": n_rows}, + action_items=[ + "Use cross-validation with appropriate folds (e.g., leave-one-out for very small n)", + "Prefer non-parametric or Bayesian approaches", + ], + ) + + # All-numeric or all-categorical + if n_num > 0 and n_cat == 0 and n_cols > 3: + self._add( + type=InsightType.FINDING, + severity=Severity.LOW, + category="general", + title="Dataset is fully numeric (no categorical columns)", + description="All columns are numeric, which simplifies preprocessing but may miss categorical patterns.", + action_items=["Verify no categorical data was inadvertently coded as integers"], + ) + elif n_cat > 0 and n_num == 0 and n_cols > 3: + self._add( + type=InsightType.FINDING, + severity=Severity.LOW, + category="general", + title="Dataset is fully categorical (no numeric columns)", + description="All columns are categorical. Numeric encoding will be needed for most ML algorithms.", + action_items=["Plan encoding strategy (one-hot, target, ordinal) for all columns"], + ) diff --git a/f2a/stats/missing.py b/f2a/stats/missing.py new file mode 100644 index 0000000..d5287ce --- /dev/null +++ b/f2a/stats/missing.py @@ -0,0 +1,74 @@ +"""Missing data analysis module.""" + +from __future__ import annotations + +import pandas as pd + +from f2a.core.schema import DataSchema + + +class MissingStats: + """Analyze missing data patterns. + + Args: + df: Target DataFrame to analyze. 
+ schema: Data schema. + """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + def column_summary(self) -> pd.DataFrame: + """Return per-column missing data summary. + + Returns: + DataFrame with missing count, ratio, and dtype per column. + """ + rows: list[dict] = [] + for col_info in self._schema.columns: + rows.append( + { + "column": col_info.name, + "missing_count": col_info.n_missing, + "missing_ratio": col_info.missing_ratio, + "missing_%": round(col_info.missing_ratio * 100, 2), + "dtype": col_info.dtype, + } + ) + + result = pd.DataFrame(rows).set_index("column") + return result.sort_values("missing_count", ascending=False) + + def row_missing_distribution(self) -> pd.DataFrame: + """Return per-row missing count distribution. + + Returns: + Frequency table of missing counts per row. + """ + row_missing = self._df.isna().sum(axis=1) + dist = row_missing.value_counts().sort_index() + return pd.DataFrame( + { + "missing_per_row": dist.index, + "row_count": dist.values, + "row_%": (dist.values / len(self._df) * 100).round(2), + } + ) + + def missing_matrix(self) -> pd.DataFrame: + """Return missing data matrix (boolean). + + Boolean matrix used for visualizing missing data patterns. + + Returns: + Boolean DataFrame where True indicates missing. + """ + return self._df.isna() + + def total_missing_ratio(self) -> float: + """Return the overall missing data ratio.""" + total_cells = self._df.shape[0] * self._df.shape[1] + if total_cells == 0: + return 0.0 + return round(float(self._df.isna().sum().sum() / total_cells), 4) diff --git a/f2a/stats/ml_readiness.py b/f2a/stats/ml_readiness.py new file mode 100644 index 0000000..354dc7e --- /dev/null +++ b/f2a/stats/ml_readiness.py @@ -0,0 +1,368 @@ +"""ML Readiness Evaluator — multi-dimensional assessment of dataset fitness. 
+ +Evaluates a dataset across six dimensions to produce a composite *readiness +score* and letter grade, together with blocking issues that **must** be resolved +and improvement suggestions that **should** be considered before feeding the +data into a machine learning pipeline. + +Dimensions +---------- +1. **Completeness** — missing value burden +2. **Consistency** — type homogeneity, value-range sanity +3. **Balance** — class / category imbalance, outlier skew +4. **Informativeness** — variance, uniqueness, MI content +5. **Independence** — multicollinearity (VIF / high-r) +6. **Scale** — sample-to-feature ratio, curse of dimensionality +""" + +from __future__ import annotations + +import math +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + +# Grade thresholds +_GRADES = [ + (95, "A+"), (90, "A"), (85, "B+"), (80, "B"), + (75, "C+"), (70, "C"), (60, "D"), (0, "F"), +] + + +def _to_grade(score: float) -> str: + for threshold, grade in _GRADES: + if score >= threshold: + return grade + return "F" + + +@dataclass +class ReadinessScore: + """ML readiness evaluation result.""" + + overall: float # 0-100 + grade: str # A+, A, B+, B, C+, C, D, F + dimensions: dict[str, float] # each 0-100 + blocking_issues: list[str] = field(default_factory=list) + suggestions: list[str] = field(default_factory=list) + details: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "overall": round(self.overall, 1), + "grade": self.grade, + "dimensions": {k: round(v, 1) for k, v in self.dimensions.items()}, + "blocking_issues": self.blocking_issues, + "suggestions": self.suggestions, + "details": self.details, + } + + +class MLReadinessEvaluator: + """Evaluate the ML-readiness of a dataset from pre-computed stats. 
+ + Parameters + ---------- + df : pd.DataFrame + The (cleaned) analysis DataFrame. + schema : DataSchema + Type metadata. + stats : StatsResult + All pre-computed statistical results (basic + advanced). + column_roles : pd.DataFrame | None + Output of ``ColumnRoleClassifier.summary()`` (optional). + """ + + # Dimension weights — must sum to 1.0 + _WEIGHTS = { + "completeness": 0.25, + "consistency": 0.15, + "balance": 0.15, + "informativeness": 0.20, + "independence": 0.15, + "scale": 0.10, + } + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + stats: Any, + column_roles: pd.DataFrame | None = None, + ) -> None: + self._df = df + self._schema = schema + self._stats = stats + self._roles = column_roles + self._blocking: list[str] = [] + self._suggestions: list[str] = [] + + def evaluate(self) -> ReadinessScore: + """Compute the overall ML readiness score.""" + dims: dict[str, float] = {} + details: dict[str, Any] = {} + + for name, method in [ + ("completeness", self._completeness), + ("consistency", self._consistency), + ("balance", self._balance), + ("informativeness", self._informativeness), + ("independence", self._independence), + ("scale", self._scale), + ]: + try: + score, det = method() + dims[name] = max(0.0, min(100.0, score)) + details[name] = det + except Exception as exc: + logger.debug("ML readiness dimension '%s' failed: %s", name, exc) + dims[name] = 50.0 # neutral fallback + details[name] = {"error": str(exc)} + + overall = sum(dims[d] * self._WEIGHTS[d] for d in dims) + grade = _to_grade(overall) + + return ReadinessScore( + overall=round(overall, 1), + grade=grade, + dimensions=dims, + blocking_issues=self._blocking, + suggestions=self._suggestions, + details=details, + ) + + # ================================================================== + # Dimension scorers — each returns (score_0_100, detail_dict) + # ================================================================== + + def _completeness(self) -> tuple[float, dict]: + 
mi = self._stats.missing_info + detail: dict[str, Any] = {} + + if mi.empty or "missing_ratio" not in mi.columns: + return 100.0, {"no_missing_info": True} + + ratios = mi["missing_ratio"] + overall_miss = float(ratios.mean()) + high_miss_cols = list(mi[ratios > 0.5].index) + mod_miss_cols = list(mi[(ratios > 0.1) & (ratios <= 0.5)].index) + + detail["overall_missing_rate"] = round(overall_miss, 4) + detail["high_missing_columns"] = high_miss_cols[:10] + detail["moderate_missing_columns"] = mod_miss_cols[:10] + + if high_miss_cols: + self._blocking.append( + f"{len(high_miss_cols)} column(s) have >50% missing — drop or impute: " + f"{', '.join(high_miss_cols[:5])}" + ) + + if mod_miss_cols: + self._suggestions.append( + f"{len(mod_miss_cols)} column(s) have 10-50% missing — plan imputation strategy" + ) + + # Score: 100 if 0 missing, linearly degrade + score = max(0, 100 * (1 - overall_miss * 2)) # 50% average missing → 0 + return score, detail + + def _consistency(self) -> tuple[float, dict]: + detail: dict[str, Any] = {} + penalties = 0.0 + n_cols = self._schema.n_cols + + # Mixed types from preprocessing + pp = self._stats.preprocessing + mixed = len(pp.mixed_type_columns) if pp else 0 + inf_cols = len(pp.infinite_value_columns) if pp else 0 + + detail["mixed_type_columns"] = mixed + detail["infinite_value_columns"] = inf_cols + + if mixed > 0: + penalties += (mixed / max(n_cols, 1)) * 40 + self._suggestions.append(f"{mixed} mixed-type column(s) — cast to consistent types") + + if inf_cols > 0: + penalties += (inf_cols / max(n_cols, 1)) * 20 + self._suggestions.append(f"{inf_cols} column(s) contain infinity values — replace with NaN or cap") + + # ID-like columns that shouldn't be features + if self._roles is not None and not self._roles.empty: + ids = self._roles[self._roles["primary_role"] == "id"] + if not ids.empty: + detail["id_columns"] = list(ids.index) + self._suggestions.append( + f"Remove {len(ids)} ID-like column(s) before modelling: " + f"{', 
'.join(list(ids.index)[:5])}" + ) + + # Constants + if self._roles is not None and not self._roles.empty: + constants = self._roles[self._roles["primary_role"] == "constant"] + if not constants.empty: + penalties += len(constants) / max(n_cols, 1) * 20 + self._blocking.append( + f"{len(constants)} constant column(s) — remove before modelling" + ) + + score = max(0, 100 - penalties) + return score, detail + + def _balance(self) -> tuple[float, dict]: + detail: dict[str, Any] = {} + penalties = 0.0 + + # Outlier ratio + out = self._stats.outlier_summary + if not out.empty and "outlier_%" in out.columns: + avg_outlier = float(out["outlier_%"].mean()) + detail["avg_outlier_pct"] = round(avg_outlier, 2) + if avg_outlier > 20: + penalties += 30 + self._suggestions.append("High average outlier rate — consider winsorization or robust methods") + elif avg_outlier > 10: + penalties += 15 + + # Categorical imbalance (Gini index) + cat_cols = self._schema.categorical_columns[:20] + if cat_cols: + ginis = [] + for col in cat_cols: + if col in self._df.columns: + vc = self._df[col].value_counts(normalize=True).values + gini = 1 - np.sum(vc ** 2) + ginis.append(gini) + if ginis: + avg_gini = float(np.mean(ginis)) + detail["avg_categorical_gini"] = round(avg_gini, 4) + # Low Gini means imbalanced + if avg_gini < 0.3: + penalties += 20 + self._suggestions.append( + "Categorical columns are highly imbalanced — consider SMOTE or class weighting" + ) + + score = max(0, 100 - penalties) + return score, detail + + def _informativeness(self) -> tuple[float, dict]: + detail: dict[str, Any] = {} + penalties = 0.0 + n_cols = self._schema.n_cols + + # Duplicate ratio + dup = self._stats.duplicate_stats + dup_ratio = dup.get("duplicate_ratio", 0) if dup else 0 + detail["duplicate_ratio"] = round(dup_ratio, 4) + if dup_ratio > 0.2: + penalties += 25 + self._blocking.append(f"{dup_ratio * 100:.0f}% duplicate rows — remove before modelling") + elif dup_ratio > 0.05: + penalties += 10 + 
self._suggestions.append("Some duplicate rows exist — verify they are intentional") + + # Low-variance features (constant or near-constant) + summary = self._stats.summary + if not summary.empty and "cv" in summary.columns: + near_const = summary[(summary["cv"].notna()) & (summary["cv"].abs() < 0.01)] + if not near_const.empty: + penalties += (len(near_const) / max(n_cols, 1)) * 30 + detail["near_constant_columns"] = list(near_const.index)[:10] + self._suggestions.append( + f"{len(near_const)} near-constant column(s) carry very little information" + ) + + # PCA compressibility (high reduction = redundancy penalty, but also okay) + pca_sum = self._stats.pca_summary + if pca_sum: + comp90 = pca_sum.get("components_for_90pct", 0) + n_num = len(self._schema.numeric_columns) + if n_num > 0 and comp90 > 0: + compression = comp90 / n_num + detail["pca_compression"] = round(compression, 3) + if compression < 0.3: + # very compressible → lots of redundancy + penalties += 10 + self._suggestions.append( + f"90% variance in just {comp90}/{n_num} PCs — consider PCA for dimensionality reduction" + ) + + score = max(0, 100 - penalties) + return score, detail + + def _independence(self) -> tuple[float, dict]: + detail: dict[str, Any] = {} + penalties = 0.0 + + vif = self._stats.vif_table + if not vif.empty and "VIF" in vif.columns: + severe = vif[vif["VIF"] > 10] + moderate = vif[(vif["VIF"] > 5) & (vif["VIF"] <= 10)] + detail["severe_vif_columns"] = list(severe.index)[:10] + detail["moderate_vif_columns"] = list(moderate.index)[:10] + + if not severe.empty: + worst_vif = float(severe["VIF"].max()) + penalties += min(50, len(severe) * 10) + if worst_vif > 100: + self._blocking.append( + f"Extreme multicollinearity: VIF={worst_vif:.0f} for '{severe['VIF'].idxmax()}' — remove or combine" + ) + else: + self._suggestions.append( + f"{len(severe)} column(s) with VIF>10 — consider regularization or PCA" + ) + + if not moderate.empty: + penalties += len(moderate) * 3 + + # 
High-correlation pairs + corr = self._stats.correlation_matrix + if not corr.empty: + n_high = 0 + cols_list = corr.columns.tolist() + for i, c1 in enumerate(cols_list): + for c2 in cols_list[i + 1:]: + if abs(corr.loc[c1, c2]) > 0.95: + n_high += 1 + if n_high > 0: + detail["near_perfect_pairs"] = n_high + penalties += min(30, n_high * 5) + + score = max(0, 100 - penalties) + return score, detail + + def _scale(self) -> tuple[float, dict]: + detail: dict[str, Any] = {} + n_rows = self._schema.n_rows + n_features = len(self._schema.numeric_columns) + len(self._schema.categorical_columns) + + ratio = n_rows / max(n_features, 1) + detail["sample_feature_ratio"] = round(ratio, 1) + detail["n_rows"] = n_rows + detail["n_features"] = n_features + + if ratio < 5: + self._blocking.append( + f"Sample-to-feature ratio is {ratio:.1f}:1 — very high overfitting risk" + ) + score = max(0, ratio / 5 * 50) + elif ratio < 10: + self._suggestions.append( + f"Sample-to-feature ratio ({ratio:.0f}:1) is low — use regularization" + ) + score = 50 + (ratio - 5) / 5 * 30 + elif ratio < 20: + score = 80 + (ratio - 10) / 10 * 15 + else: + score = min(100, 95 + min(ratio / 100, 1) * 5) + + return score, detail diff --git a/f2a/stats/outlier.py b/f2a/stats/outlier.py new file mode 100644 index 0000000..8f5d1e9 --- /dev/null +++ b/f2a/stats/outlier.py @@ -0,0 +1,159 @@ +"""Outlier detection module. + +Provides IQR-based and Z-score-based outlier detection for numeric columns. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema + + +class OutlierStats: + """Detect and summarise outliers in numeric columns. + + Args: + df: Target DataFrame. + schema: Data schema. 
+ """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + # ── IQR method ──────────────────────────────────────── + + def iqr_summary(self, multiplier: float = 1.5) -> pd.DataFrame: + """Detect outliers using the IQR fence method. + + Args: + multiplier: IQR multiplier (default 1.5 for moderate outliers, + 3.0 for extreme outliers). + + Returns: + Per-column outlier summary DataFrame. + """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + if len(series) == 0: + continue + + q1 = float(series.quantile(0.25)) + q3 = float(series.quantile(0.75)) + iqr = q3 - q1 + lower = q1 - multiplier * iqr + upper = q3 + multiplier * iqr + + outlier_mask = (series < lower) | (series > upper) + outliers = series[outlier_mask] + n_outliers = len(outliers) + + rows.append({ + "column": col, + "q1": round(q1, 4), + "q3": round(q3, 4), + "iqr": round(iqr, 4), + "lower_bound": round(lower, 4), + "upper_bound": round(upper, 4), + "outlier_count": n_outliers, + "outlier_%": round(n_outliers / len(series) * 100, 2), + "min_outlier": round(float(outliers.min()), 4) if n_outliers > 0 else None, + "max_outlier": round(float(outliers.max()), 4) if n_outliers > 0 else None, + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Z-score method ──────────────────────────────────── + + def zscore_summary(self, threshold: float = 3.0) -> pd.DataFrame: + """Detect outliers using the Z-score method. + + Args: + threshold: Z-score absolute threshold (default 3.0). + + Returns: + Per-column outlier summary DataFrame. 
+ """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + if len(series) < 3: + continue + + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + continue + + z = np.abs((series - mean) / std) + n_outliers = int((z > threshold).sum()) + + rows.append({ + "column": col, + "mean": round(mean, 4), + "std": round(std, 4), + "threshold": threshold, + "outlier_count": n_outliers, + "outlier_%": round(n_outliers / len(series) * 100, 2), + "max_zscore": round(float(z.max()), 4), + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Convenience ─────────────────────────────────────── + + def summary(self, method: str = "iqr", **kwargs) -> pd.DataFrame: + """Return outlier summary using the specified *method*. + + Args: + method: ``"iqr"`` (default) or ``"zscore"``. + **kwargs: Passed to the underlying method. + """ + if method == "zscore": + return self.zscore_summary(**kwargs) + return self.iqr_summary(**kwargs) + + def outlier_mask(self, method: str = "iqr", **kwargs) -> pd.DataFrame: + """Return a boolean DataFrame where ``True`` marks an outlier. + + Useful for downstream visualisation. 
+ """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + mask = pd.DataFrame(False, index=self._df.index, columns=cols) + + if method == "zscore": + threshold = kwargs.get("threshold", 3.0) + for col in cols: + series = self._df[col].dropna() + if len(series) < 3 or series.std() == 0: + continue + z = np.abs((series - series.mean()) / series.std()) + mask.loc[z.index, col] = z > threshold + else: + multiplier = kwargs.get("multiplier", 1.5) + for col in cols: + series = self._df[col].dropna() + if len(series) == 0: + continue + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + mask.loc[series.index, col] = (series < q1 - multiplier * iqr) | ( + series > q3 + multiplier * iqr + ) + + return mask diff --git a/f2a/stats/pca_analysis.py b/f2a/stats/pca_analysis.py new file mode 100644 index 0000000..b167bff --- /dev/null +++ b/f2a/stats/pca_analysis.py @@ -0,0 +1,159 @@ +"""PCA (Principal Component Analysis) module. + +Computes variance explained, loadings, and transformed coordinates +for numeric columns. Requires ``scikit-learn``. +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + + +class PCAStats: + """Perform PCA on numeric columns. + + Args: + df: Target DataFrame. + schema: Data schema. + max_components: Maximum number of components to compute. + """ + + def __init__( + self, + df: pd.DataFrame, + schema: DataSchema, + max_components: int = 10, + ) -> None: + self._df = df + self._schema = schema + self._max_components = max_components + + self._fitted = False + self._pca: Any = None + self._X_scaled: np.ndarray | None = None + self._feature_names: list[str] = [] + self._n_components = 0 + + # ── Lazy fitting ────────────────────────────────────── + + def _fit(self) -> bool: + """Fit PCA model. 
Returns ``True`` on success.""" + if self._fitted: + return self._pca is not None + + self._fitted = True + + cols = self._schema.numeric_columns + if len(cols) < 2: + return False + + try: + from sklearn.decomposition import PCA + from sklearn.preprocessing import StandardScaler + except ImportError: + logger.info("scikit-learn not installed; skipping PCA analysis.") + return False + + df_clean = self._df[cols].dropna() + if len(df_clean) < max(10, len(cols)): + return False + + try: + scaler = StandardScaler() + X = scaler.fit_transform(df_clean) + + self._n_components = min(self._max_components, len(cols), len(df_clean) - 1) + if self._n_components < 1: + return False + + self._pca = PCA(n_components=self._n_components) + self._pca.fit(X) + self._X_scaled = X + self._feature_names = list(cols) + return True + except Exception as exc: + logger.warning("PCA failed: %s", exc) + return False + + # ── Variance explained ──────────────────────────────── + + def variance_explained(self) -> pd.DataFrame: + """Return variance explained by each principal component. + + Returns: + DataFrame with variance ratio, cumulative ratio, and eigenvalue + per component. + """ + if not self._fit(): + return pd.DataFrame() + + rows: list[dict] = [] + cum = np.cumsum(self._pca.explained_variance_ratio_) + for i in range(self._n_components): + rows.append({ + "component": f"PC{i + 1}", + "variance_ratio": round(float(self._pca.explained_variance_ratio_[i]), 4), + "cumulative_ratio": round(float(cum[i]), 4), + "eigenvalue": round(float(self._pca.explained_variance_[i]), 4), + }) + + return pd.DataFrame(rows).set_index("component") + + # ── Loadings ────────────────────────────────────────── + + def loadings(self) -> pd.DataFrame: + """Return PCA loadings (feature weights per component). + + Returns: + DataFrame with features as rows and ``PC1 .. PCn`` as columns. 
+ """ + if not self._fit(): + return pd.DataFrame() + + n_show = min(5, self._n_components) + cols = [f"PC{i + 1}" for i in range(n_show)] + return pd.DataFrame( + self._pca.components_[:n_show].T, + index=self._feature_names, + columns=cols, + ).round(4) + + # ── Transformed coordinates ─────────────────────────── + + def transformed(self, n_components: int = 2) -> pd.DataFrame: + """Return data projected onto the first *n_components* principal components.""" + if not self._fit() or self._X_scaled is None: + return pd.DataFrame() + + n = min(n_components, self._n_components) + coords = self._pca.transform(self._X_scaled)[:, :n] + cols = [f"PC{i + 1}" for i in range(n)] + return pd.DataFrame(coords, columns=cols) + + # ── Summary ─────────────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return a concise PCA summary.""" + ve = self.variance_explained() + if ve.empty: + return {} + + # Number of components to reach 90 % variance + cum = ve["cumulative_ratio"] + above_90 = cum[cum >= 0.90] + n_for_90 = int(above_90.index[0].replace("PC", "")) if len(above_90) > 0 else len(cum) + + return { + "n_components": len(ve), + "total_variance_explained": round(float(cum.iloc[-1]), 4), + "components_for_90pct": n_for_90, + "top_component_variance": round(float(ve["variance_ratio"].iloc[0]), 4), + } diff --git a/f2a/stats/quality.py b/f2a/stats/quality.py new file mode 100644 index 0000000..461fe02 --- /dev/null +++ b/f2a/stats/quality.py @@ -0,0 +1,236 @@ +"""Data quality scoring module. + +Computes per-column and overall quality scores across **six** dimensions: + +1. **Completeness** — proportion of non-missing cells. +2. **Uniqueness** — proportion of non-duplicate rows. +3. **Consistency** — dtype-based type-uniformity check (fast). +4. **Validity** — proportion of finite numeric values (no ``inf``). +5. **Timeliness** — recency of datetime columns (optional). +6. **Conformity** — value-range and pattern compliance. 
+ +The ``overall_score`` is a weighted average of whichever dimensions apply +to the dataset, ensuring the score adapts to the data's characteristics. +""" + +from __future__ import annotations + +import re +from typing import Any + +import numpy as np +import pandas as pd + +from f2a.core.schema import DataSchema + + +class QualityStats: + """Compute data quality scores. + + Args: + df: Target DataFrame. + schema: Data schema. + """ + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + # ── Dimension scores ────────────────────────────────── + + def completeness(self) -> float: + """Proportion of non-missing cells.""" + total = self._df.shape[0] * self._df.shape[1] + if total == 0: + return 1.0 + return round(1.0 - float(self._df.isna().sum().sum() / total), 4) + + def uniqueness(self) -> float: + """Proportion of non-duplicate rows.""" + n = len(self._df) + if n == 0: + return 1.0 + return round(1.0 - float(self._df.duplicated().sum() / n), 4) + + def consistency(self) -> float: + """Type-consistency score — fraction of columns with uniform dtype. + + Uses ``dtype.kind`` instead of the slow per-element ``apply(type)`` + approach, checking whether object-typed columns are truly mixed-type. + """ + ncol = len(self._df.columns) + if ncol == 0: + return 1.0 + + consistent = 0 + for col in self._df.columns: + kind = self._df[col].dtype.kind + if kind != "O": + # Non-object dtypes (int, float, bool, datetime, …) are + # inherently type-consistent. + consistent += 1 + continue + # For object columns, sample up to 1 000 values and check types. 
+ non_null = self._df[col].dropna() + if len(non_null) == 0: + consistent += 1 + continue + sample = non_null.head(1_000) + types_seen = set(type(v).__name__ for v in sample.values) + if len(types_seen) <= 1: + consistent += 1 + + return round(consistent / ncol, 4) + + def validity(self) -> float: + """Proportion of finite numeric values (excludes ``inf`` / ``-inf``).""" + num_cols = self._schema.numeric_columns + if not num_cols: + return 1.0 + + total = 0 + valid = 0 + for col in num_cols: + series = self._df[col].dropna() + total += len(series) + valid += int(np.isfinite(series).sum()) + + return round(valid / total, 4) if total > 0 else 1.0 + + def timeliness(self) -> float | None: + """Recency score for datetime columns (0 = ancient, 1 = fresh). + + If no datetime columns exist, returns ``None`` and the dimension + is excluded from the overall score. + + Heuristic: score = mean(exp(−days_since / 365)) across datetime cols. + """ + dt_cols = self._schema.datetime_columns + if not dt_cols: + return None + + now = pd.Timestamp.now() + scores: list[float] = [] + for col in dt_cols: + series = pd.to_datetime(self._df[col], errors="coerce").dropna() + if series.empty: + continue + max_ts = series.max() + if pd.isna(max_ts): + continue + days_since = max((now - max_ts).days, 0) + # exponential decay with half-life ≈ 253 days + scores.append(float(np.exp(-days_since / 365.0))) + + if not scores: + return None + return round(float(np.mean(scores)), 4) + + def conformity(self) -> float: + """Pattern-and-range compliance score. + + Checks: + * Numeric columns: values within [μ ± 4σ] (i.e. no extreme outliers). + * String columns: no excessively long / short values or embedded + control characters. + + Returns: + Score in [0, 1]. 1.0 = fully conforming. 
+ """ + scores: list[float] = [] + + # ── Numeric: fraction within ±4σ + for col in self._schema.numeric_columns: + series = self._df[col].dropna() + if len(series) < 10: + scores.append(1.0) + continue + mu, sigma = float(series.mean()), float(series.std()) + if sigma == 0: + scores.append(1.0) + continue + in_range = ((series >= mu - 4 * sigma) & (series <= mu + 4 * sigma)).sum() + scores.append(float(in_range) / len(series)) + + # ── String: no control characters (ASCII 0-31 except \n\r\t) + _CTRL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]") + for col in self._schema.categorical_columns: + series = self._df[col].dropna().astype(str).head(2_000) + if series.empty: + scores.append(1.0) + continue + has_ctrl = series.apply(lambda v: bool(_CTRL_RE.search(v))) + scores.append(1.0 - float(has_ctrl.mean())) + + if not scores: + return 1.0 + return round(float(np.mean(scores)), 4) + + def overall_score(self) -> float: + """Weighted average of all applicable quality dimensions. + + Base weights (always active): + completeness 30 %, uniqueness 20 %, consistency 15 %, + validity 15 %, conformity 10 %. + If timeliness is available, it receives 10 % and the others + are proportionally reduced. 
+ """ + dims: dict[str, tuple[float, float]] = { + "completeness": (0.30, self.completeness()), + "uniqueness": (0.20, self.uniqueness()), + "consistency": (0.15, self.consistency()), + "validity": (0.15, self.validity()), + "conformity": (0.10, self.conformity()), + } + + timeliness_val = self.timeliness() + if timeliness_val is not None: + dims["timeliness"] = (0.10, timeliness_val) + + total_weight = sum(w for w, _ in dims.values()) + score = sum(w * v for w, v in dims.values()) / total_weight + return round(score, 4) + + # ── Summaries ───────────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return all quality dimension scores.""" + result: dict[str, Any] = { + "completeness": self.completeness(), + "uniqueness": self.uniqueness(), + "consistency": self.consistency(), + "validity": self.validity(), + "conformity": self.conformity(), + } + timeliness_val = self.timeliness() + if timeliness_val is not None: + result["timeliness"] = timeliness_val + result["overall"] = self.overall_score() + return result + + def column_quality(self) -> pd.DataFrame: + """Return per-column quality scores. + + Returns: + DataFrame indexed by column name with completeness, uniqueness, + type, and composite quality_score. 
+ """ + rows: list[dict] = [] + for col_info in self._schema.columns: + col = col_info.name + series = self._df[col] + compl = 1.0 - col_info.missing_ratio + + n_total = int(series.count()) + n_unique = int(series.nunique()) + uniqueness = n_unique / n_total if n_total > 0 else 1.0 + + rows.append({ + "column": col, + "completeness": round(compl, 4), + "uniqueness": round(min(uniqueness, 1.0), 4), + "type": col_info.inferred_type.value, + "quality_score": round((compl + min(uniqueness, 1.0)) / 2, 4), + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() diff --git a/f2a/stats/statistical_tests.py b/f2a/stats/statistical_tests.py new file mode 100644 index 0000000..a0cbb5e --- /dev/null +++ b/f2a/stats/statistical_tests.py @@ -0,0 +1,494 @@ +"""Statistical hypothesis tests module. + +Provides Levene, Kruskal-Wallis, Mann-Whitney, Chi-Square goodness-of-fit, +Grubbs outlier test, and Augmented Dickey-Fuller stationarity test. + +**Enhancements over v1**: + +* **Kruskal-Wallis** now uses categorical columns as grouping variables + so each test compares one numeric column across groups of a factor — the + semantically correct usage. +* **Benjamini-Hochberg FDR** correction is applied to all pairwise / + multi-test batteries (Levene, Mann-Whitney, Kruskal-Wallis). 
+* **Effect sizes** are reported alongside every test: + - rank-biserial *r* for Mann-Whitney U + - η² (eta-squared) for Kruskal-Wallis H + - Cohen's *d* proxy for Levene (log-variance difference) + - Cramér's *V* for Chi-Square + +References: + - Levene (1960) — equality of variances + - Kruskal & Wallis (1952) — non-parametric one-way ANOVA + - Mann & Whitney (1947) — two-sample rank test + - Grubbs (1950) — single-outlier test + - Dickey & Fuller (1979) — stationarity test + - Benjamini & Hochberg (1995) — FDR control + - Rosenthal (1991) — rank-biserial correlation + - Cohen (1988) — effect size conventions +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pandas as pd +from scipy import stats as sp_stats + +from f2a.core.schema import DataSchema +from f2a.utils.logging import get_logger + +logger = get_logger(__name__) + +# ── Utility: Benjamini-Hochberg FDR correction ─────────── + + +def _bh_adjust(p_values: list[float]) -> list[float]: + """Return Benjamini-Hochberg adjusted p-values. + + Args: + p_values: Raw p-values (same order as rows). + + Returns: + Adjusted p-values clipped to [0, 1]. + """ + m = len(p_values) + if m == 0: + return [] + arr = np.asarray(p_values, dtype=float) + order = np.argsort(arr) + ranked = np.empty_like(arr) + ranked[order] = np.arange(1, m + 1) + + adjusted = arr * m / ranked + # enforce monotonicity (descending by rank order) + sorted_idx = np.argsort(ranked)[::-1] + cum_min = np.minimum.accumulate(adjusted[sorted_idx]) + adjusted[sorted_idx] = cum_min + return np.clip(adjusted, 0.0, 1.0).tolist() + + +def _significance_stars(p: float) -> str: + """Return significance star annotation.""" + if p < 0.001: + return "***" + if p < 0.01: + return "**" + if p < 0.05: + return "*" + if p < 0.1: + return "†" + return "ns" + + +class StatisticalTests: + """Perform various statistical hypothesis tests. + + Args: + df: Target DataFrame. + schema: Data schema. 
+ """ + + _MAX_PAIRWISE = 15 + _MAX_CATEGORIES = 20 + _MIN_GROUP_SIZE = 5 + + def __init__(self, df: pd.DataFrame, schema: DataSchema) -> None: + self._df = df + self._schema = schema + + # ── Levene's test (homogeneity of variances) ────────── + + def levene_test(self) -> pd.DataFrame: + """Levene's test for equality of variances across numeric columns. + + Tests whether pairs of numeric columns have equal variances. + Results include BH-adjusted p-values and a log-variance-ratio + effect size proxy. + + Returns: + DataFrame with pairwise Levene test results. + """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + + cols = cols[: self._MAX_PAIRWISE] + rows: list[dict] = [] + + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + a = self._df[cols[i]].dropna().values + b = self._df[cols[j]].dropna().values + if len(a) < 3 or len(b) < 3: + continue + try: + stat, p = sp_stats.levene(a, b) + # Effect size: absolute log-variance ratio + var_a = float(np.var(a, ddof=1)) if len(a) > 1 else 1e-12 + var_b = float(np.var(b, ddof=1)) if len(b) > 1 else 1e-12 + log_var_ratio = abs( + float(np.log(max(var_a, 1e-12) / max(var_b, 1e-12))) + ) + rows.append({ + "col_a": cols[i], + "col_b": cols[j], + "levene_stat": round(float(stat), 4), + "p_value": round(float(p), 6), + "log_var_ratio": round(log_var_ratio, 4), + }) + except Exception: + continue + + if not rows: + return pd.DataFrame() + + # BH-adjusted p-values + raw_p = [r["p_value"] for r in rows] + adj_p = _bh_adjust(raw_p) + for r, ap in zip(rows, adj_p): + r["adjusted_p"] = round(ap, 6) + r["significant_0.05"] = ap < 0.05 + r["stars"] = _significance_stars(ap) + + return pd.DataFrame(rows) + + # ── Kruskal-Wallis test ─────────────────────────────── + + def kruskal_wallis(self) -> pd.DataFrame: + """Kruskal-Wallis H-test: numeric column grouped by categorical factor. 
+ + For each (categorical, numeric) pair the test checks whether the + numeric distribution differs across the levels of the factor. + Reports η² (eta-squared) effect size and BH-adjusted p-values. + + Returns: + DataFrame with one row per (grouping_col, numeric_col) pair. + """ + num_cols = self._schema.numeric_columns + cat_cols = self._schema.categorical_columns + + if not num_cols or not cat_cols: + return pd.DataFrame() + + # Limit to manageable size + cat_cols = cat_cols[:10] + num_cols = num_cols[:15] + + rows: list[dict] = [] + + for cat in cat_cols: + groups_series = self._df[cat] + unique_vals = groups_series.dropna().unique() + # skip useless groupings (1 group, or >50 levels) + if len(unique_vals) < 2 or len(unique_vals) > 50: + continue + + for num in num_cols: + sub = self._df[[cat, num]].dropna() + grouped = [ + grp[num].values + for _, grp in sub.groupby(cat) + if len(grp) >= self._MIN_GROUP_SIZE + ] + if len(grouped) < 2: + continue + + try: + stat, p = sp_stats.kruskal(*grouped) + n_total = sum(len(g) for g in grouped) + k = len(grouped) + # η² = (H - k + 1) / (n - k) + eta_sq = max( + 0.0, (float(stat) - k + 1) / (n_total - k) + ) if n_total > k else 0.0 + rows.append({ + "grouping_col": cat, + "numeric_col": num, + "n_groups": k, + "h_statistic": round(float(stat), 4), + "p_value": round(float(p), 6), + "eta_squared": round(eta_sq, 4), + "effect_magnitude": ( + "large" if eta_sq >= 0.14 + else "medium" if eta_sq >= 0.06 + else "small" + ), + }) + except Exception: + continue + + if not rows: + return pd.DataFrame() + + # BH correction + raw_p = [r["p_value"] for r in rows] + adj_p = _bh_adjust(raw_p) + for r, ap in zip(rows, adj_p): + r["adjusted_p"] = round(ap, 6) + r["reject_h0_0.05"] = ap < 0.05 + r["stars"] = _significance_stars(ap) + r["interpretation"] = ( + f"Significant (η²={r['eta_squared']}, {r['effect_magnitude']})" + if ap < 0.05 + else "No significant difference" + ) + + return pd.DataFrame(rows) + + # ── Mann-Whitney U test 
─────────────────────────────── + + def mann_whitney(self) -> pd.DataFrame: + """Pairwise Mann-Whitney U tests between numeric columns. + + Reports rank-biserial *r* effect size (Rosenthal, 1991) and + BH-adjusted p-values. + + Returns: + DataFrame with col_a, col_b, U-stat, p-value, effect size. + """ + cols = self._schema.numeric_columns + if len(cols) < 2: + return pd.DataFrame() + + cols = cols[: self._MAX_PAIRWISE] + rows: list[dict] = [] + + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + a = self._df[cols[i]].dropna().values + b = self._df[cols[j]].dropna().values + if len(a) < self._MIN_GROUP_SIZE or len(b) < self._MIN_GROUP_SIZE: + continue + try: + stat, p = sp_stats.mannwhitneyu(a, b, alternative="two-sided") + n1, n2 = len(a), len(b) + # rank-biserial r = 1 - 2U / (n1 * n2) + r_rb = 1.0 - 2.0 * float(stat) / (n1 * n2) + rows.append({ + "col_a": cols[i], + "col_b": cols[j], + "u_statistic": round(float(stat), 2), + "p_value": round(float(p), 6), + "rank_biserial_r": round(r_rb, 4), + "effect_magnitude": ( + "large" if abs(r_rb) >= 0.5 + else "medium" if abs(r_rb) >= 0.3 + else "small" + ), + }) + except Exception: + continue + + if not rows: + return pd.DataFrame() + + # BH correction + raw_p = [r["p_value"] for r in rows] + adj_p = _bh_adjust(raw_p) + for r, ap in zip(rows, adj_p): + r["adjusted_p"] = round(ap, 6) + r["significant_0.05"] = ap < 0.05 + r["stars"] = _significance_stars(ap) + + return pd.DataFrame(rows) + + # ── Chi-square goodness-of-fit ──────────────────────── + + def chi_square_goodness(self) -> pd.DataFrame: + """Chi-square goodness-of-fit test for categorical columns. + + Tests whether observed frequencies differ from expected uniform. + Reports Cramér's *V* effect size. + + Returns: + DataFrame with test results per categorical column. 
+ """ + cols = self._schema.categorical_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols[: self._MAX_CATEGORIES]: + vc = self._df[col].value_counts() + if len(vc) < 2 or len(vc) > 100: + continue + + observed = vc.values.astype(float) + expected = np.full_like(observed, observed.mean()) + n_obs = float(observed.sum()) + k = len(vc) + + try: + stat, p = sp_stats.chisquare(observed, f_exp=expected) + # Cramér's V for goodness-of-fit: sqrt(chi2 / (n*(k-1))) + cramers_v = float(np.sqrt(stat / (n_obs * (k - 1)))) if k > 1 else 0.0 + rows.append({ + "column": col, + "n_categories": k, + "chi2_stat": round(float(stat), 4), + "p_value": round(float(p), 6), + "cramers_v": round(cramers_v, 4), + "effect_magnitude": ( + "large" if cramers_v >= 0.5 + else "medium" if cramers_v >= 0.3 + else "small" + ), + "uniform_0.05": float(p) > 0.05, + "interpretation": ( + "Approximately uniform" + if float(p) > 0.05 + else "Non-uniform distribution" + ), + }) + except Exception: + continue + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Grubbs' outlier test ────────────────────────────── + + def grubbs_test(self, alpha: float = 0.05) -> pd.DataFrame: + """Grubbs' test for a single outlier in each numeric column. + + Tests whether the maximum or minimum value is an outlier + assuming normal distribution. + + Args: + alpha: Significance level. + + Returns: + DataFrame with test results per column. 
+ """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + n = len(series) + if n < 7: + continue + + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + continue + + # Test statistic = max(|x_i - mean|) / std + max_diff_idx = (series - mean).abs().idxmax() + max_val = float(series.loc[max_diff_idx]) + g_stat = abs(max_val - mean) / std + + # Critical value (t-distribution) + t_crit = float(sp_stats.t.ppf(1 - alpha / (2 * n), n - 2)) + g_crit = (n - 1) / np.sqrt(n) * np.sqrt(t_crit**2 / (n - 2 + t_crit**2)) + + is_outlier = g_stat > g_crit + + rows.append({ + "column": col, + "suspect_value": round(max_val, 4), + "grubbs_statistic": round(float(g_stat), 4), + "critical_value": round(float(g_crit), 4), + "is_outlier": is_outlier, + "n": n, + }) + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Augmented Dickey-Fuller (stationarity) ──────────── + + def adf_test(self) -> pd.DataFrame: + """Augmented Dickey-Fuller test for stationarity. + + Tests whether a numeric time-series is stationary. + H0: The series has a unit root (non-stationary). + + Returns: + DataFrame with ADF results per numeric column. 
+ """ + cols = self._schema.numeric_columns + if not cols: + return pd.DataFrame() + + try: + from statsmodels.tsa.stattools import adfuller + except ImportError: + logger.info("statsmodels not available; skipping ADF test.") + return pd.DataFrame() + + rows: list[dict] = [] + for col in cols: + series = self._df[col].dropna() + if len(series) < 20: + continue + try: + result = adfuller(series, autolag="AIC") + adf_stat, p_val, used_lag, nobs, critical_values, ic_best = result + rows.append({ + "column": col, + "adf_statistic": round(float(adf_stat), 4), + "p_value": round(float(p_val), 6), + "used_lag": int(used_lag), + "n_observations": int(nobs), + "critical_1%": round(float(critical_values["1%"]), 4), + "critical_5%": round(float(critical_values["5%"]), 4), + "critical_10%": round(float(critical_values["10%"]), 4), + "is_stationary_0.05": float(p_val) < 0.05, + }) + except Exception: + continue + + return pd.DataFrame(rows).set_index("column") if rows else pd.DataFrame() + + # ── Summary ─────────────────────────────────────────── + + def summary(self) -> dict[str, Any]: + """Return combined statistical test results.""" + result: dict[str, Any] = {} + + try: + lev = self.levene_test() + if not lev.empty: + result["levene"] = lev + except Exception as exc: + logger.debug("Levene test skipped: %s", exc) + + try: + kw = self.kruskal_wallis() + if not kw.empty: + result["kruskal_wallis"] = kw + except Exception as exc: + logger.debug("Kruskal-Wallis skipped: %s", exc) + + try: + mw = self.mann_whitney() + if not mw.empty: + result["mann_whitney"] = mw + except Exception as exc: + logger.debug("Mann-Whitney skipped: %s", exc) + + try: + csq = self.chi_square_goodness() + if not csq.empty: + result["chi_square_goodness"] = csq + except Exception as exc: + logger.debug("Chi-square goodness skipped: %s", exc) + + try: + grb = self.grubbs_test() + if not grb.empty: + result["grubbs"] = grb + except Exception as exc: + logger.debug("Grubbs test skipped: %s", exc) + + 
try: + adf = self.adf_test() + if not adf.empty: + result["adf"] = adf + except Exception as exc: + logger.debug("ADF test skipped: %s", exc) + + return result diff --git a/f2a/utils/__init__.py b/f2a/utils/__init__.py new file mode 100644 index 0000000..e650a02 --- /dev/null +++ b/f2a/utils/__init__.py @@ -0,0 +1 @@ +"""Utilities module — type inference, validation, and logging.""" diff --git a/f2a/utils/exceptions.py b/f2a/utils/exceptions.py new file mode 100644 index 0000000..2d3b068 --- /dev/null +++ b/f2a/utils/exceptions.py @@ -0,0 +1,32 @@ +"""Custom exception definitions.""" + + +class F2AError(Exception): + """Base exception for the f2a library.""" + + +class UnsupportedFormatError(F2AError): + """Unsupported file format.""" + + def __init__(self, source: str, detected: str | None = None) -> None: + msg = f"Unsupported file format: {source}" + if detected: + msg += f" (detected: {detected})" + super().__init__(msg) + + +class DataLoadError(F2AError): + """Data loading failure.""" + + def __init__(self, source: str, reason: str = "") -> None: + msg = f"Failed to load data: {source}" + if reason: + msg += f" — {reason}" + super().__init__(msg) + + +class EmptyDataError(F2AError): + """Empty dataset.""" + + def __init__(self, source: str) -> None: + super().__init__(f"Dataset is empty: {source}") diff --git a/f2a/utils/logging.py b/f2a/utils/logging.py new file mode 100644 index 0000000..a0f77f4 --- /dev/null +++ b/f2a/utils/logging.py @@ -0,0 +1,23 @@ +"""f2a logging configuration.""" + +import logging + +_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" + + +def get_logger(name: str) -> logging.Logger: + """Return a module-level logger. + + Args: + name: Logger name (typically ``__name__``). + + Returns: + Configured :class:`logging.Logger` instance. 
+ """ + logger = logging.getLogger(f"f2a.{name}") + if not logger.handlers: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter(_LOG_FORMAT)) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + return logger diff --git a/f2a/utils/type_inference.py b/f2a/utils/type_inference.py new file mode 100644 index 0000000..1607a54 --- /dev/null +++ b/f2a/utils/type_inference.py @@ -0,0 +1,95 @@ +"""Automatic data type inference utilities.""" + +from __future__ import annotations + +from enum import Enum + +import pandas as pd + + +class ColumnType(str, Enum): + """Column type classification.""" + + NUMERIC = "numeric" + CATEGORICAL = "categorical" + TEXT = "text" + DATETIME = "datetime" + BOOLEAN = "boolean" + + +# Max unique value ratio to consider a column categorical +_CATEGORICAL_RATIO_THRESHOLD = 0.05 # 5% +# Max absolute unique count to consider a column categorical +_CATEGORICAL_UNIQUE_THRESHOLD = 50 +# Min average string length to consider a column text +_TEXT_LENGTH_THRESHOLD = 50 + + +def infer_column_type(series: pd.Series) -> ColumnType: + """Infer the semantic type of a single column. + + Args: + series: Target pandas Series to analyze. + + Returns: + Inferred :class:`ColumnType`. + """ + # Boolean check + try: + if series.dtype == "bool" or set(series.dropna().unique()) <= {True, False, 0, 1}: + return ColumnType.BOOLEAN + except TypeError: + # Column contains unhashable types (e.g. 
numpy arrays, lists) + return ColumnType.TEXT + + # Datetime check + if pd.api.types.is_datetime64_any_dtype(series): + return ColumnType.DATETIME + + # Numeric check + if pd.api.types.is_numeric_dtype(series): + n_unique = series.nunique() + n_total = len(series) + # Treat as categorical if very few unique values + if n_unique <= 10 and n_total > 100: + return ColumnType.CATEGORICAL + return ColumnType.NUMERIC + + # String types + if pd.api.types.is_string_dtype(series) or pd.api.types.is_object_dtype(series): + n_unique = series.nunique() + n_total = len(series.dropna()) + + if n_total == 0: + return ColumnType.TEXT + + # Attempt datetime parsing + try: + pd.to_datetime(series.dropna().head(20)) + return ColumnType.DATETIME + except (ValueError, TypeError): + pass + + # Determine text vs categorical by unique ratio and string length + ratio = n_unique / n_total if n_total > 0 else 1.0 + avg_len = series.dropna().astype(str).str.len().mean() + + if avg_len > _TEXT_LENGTH_THRESHOLD: + return ColumnType.TEXT + if n_unique <= _CATEGORICAL_UNIQUE_THRESHOLD or ratio <= _CATEGORICAL_RATIO_THRESHOLD: + return ColumnType.CATEGORICAL + return ColumnType.TEXT + + return ColumnType.TEXT + + +def infer_all_types(df: pd.DataFrame) -> dict[str, ColumnType]: + """Infer types for all columns in a DataFrame. + + Args: + df: Target DataFrame to analyze. + + Returns: + Column name → :class:`ColumnType` mapping. + """ + return {col: infer_column_type(df[col]) for col in df.columns} diff --git a/f2a/utils/validators.py b/f2a/utils/validators.py new file mode 100644 index 0000000..03ede7d --- /dev/null +++ b/f2a/utils/validators.py @@ -0,0 +1,280 @@ +"""Input validation utilities.""" + +from __future__ import annotations + +import re +from pathlib import Path + +from f2a.utils.exceptions import UnsupportedFormatError + +# ── Supported extensions → source type mapping ──────── +# Register new formats here; they will be auto-routed. 
+SUPPORTED_EXTENSIONS: dict[str, str] = { + # CSV / delimited text + ".csv": "csv", + ".tsv": "tsv", + ".txt": "delimited", # auto-detect delimiter + ".dat": "delimited", + ".tab": "tsv", + # JSON family + ".json": "json", + ".jsonl": "jsonl", + ".ndjson": "jsonl", + # Spreadsheets + ".xlsx": "excel", + ".xls": "excel", + ".xlsm": "excel", + ".xlsb": "excel", + ".ods": "ods", + # Binary / columnar formats + ".parquet": "parquet", + ".pq": "parquet", + ".feather": "feather", + ".ftr": "feather", + ".arrow": "arrow_ipc", + ".ipc": "arrow_ipc", + ".orc": "orc", + ".hdf": "hdf5", + ".hdf5": "hdf5", + ".h5": "hdf5", + ".pkl": "pickle", + ".pickle": "pickle", + # Statistical packages + ".sas7bdat": "sas", + ".xpt": "sas_xport", + ".dta": "stata", + ".sav": "spss", + ".zsav": "spss", + ".por": "spss", + # Databases + ".db": "sqlite", + ".sqlite": "sqlite", + ".sqlite3": "sqlite", + ".ddb": "duckdb", + ".duckdb": "duckdb", + # Markup / structured text + ".xml": "xml", + ".html": "html", + ".htm": "html", + # Fixed-width + ".fwf": "fwf", +} + +HF_PREFIXES = ("hf://", "huggingface://") +HF_URL_PATTERN = re.compile( + r"^https?://huggingface\.co/datasets/" + r"(?P| ignore |
| 1 |
{col.get('dtype', '')}| Column | DType | Inferred | -Unique | Missing | -
|---|
| Severity | Column | Message | Recommendation |
|---|
| Dimension | Score | Detail |
|---|
| Dimension | Score |
|---|
| Column | Count | -Mean | Std | -Min | Median | -Max | Skew | -Kurt | -
|---|
| Column | Count | -Unique | Top | -Freq | -
|---|
No missing values detected.
" - - rows = "" - for col_data in per_col: - n = col_data.get("n_missing", 0) - ratio = col_data.get("missing_ratio", 0) - if n > 0: - bar_width = ratio * 100 - color = "#22c55e" if bar_width < 5 else "#eab308" if bar_width < 30 else "#ef4444" - rows += f""" -No missing values detected.
" - - return f""" -| Column | Missing | Ratio |
|---|
{self._escape_html(json_str)}
- {t('generated_by', self.lang)} v{self._get_version()}
- {body} -lerobot/roboturk
+ +| column | +type | +count | +missing | +missing_% | +unique | +mean | +median | +std | +se | +cv | +mad | +min | +max | +range | +p5 | +q1 | +q3 | +p95 | +iqr | +skewness | +kurtosis | +top | +freq | +
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| observation.state | +text | +187507 | +0 | +0.0000 | +187507 | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +
| action | +text | +187507 | +0 | +0.0000 | +187507 | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +
| timestamp | +numeric | +187507 | +0 | +0.0000 | +352 | +6.3121 | +5.3000 | +4.8659 | +0.0112 | +0.7709 | +3.3000 | +0.0000 | +35.1000 | +35.1000 | +0.4000 | +2.4000 | +9.3000 | +15.0000 | +6.9000 | +1.0530 | +1.4028 | +nan | +nan | +
| episode_index | +numeric | +187507 | +0 | +0.0000 | +1995 | +1004.9688 | +998.0000 | +573.2126 | +1.3238 | +0.5704 | +500.0000 | +0.0000 | +1994.0000 | +1994.0000 | +108.0000 | +501.0000 | +1503.0000 | +1890.0000 | +1002.0000 | +-0.0093 | +-1.2072 | +nan | +nan | +
| frame_index | +numeric | +187507 | +0 | +0.0000 | +352 | +63.1209 | +53.0000 | +48.6586 | +0.1124 | +0.7709 | +33.0000 | +0.0000 | +351.0000 | +351.0000 | +4.0000 | +24.0000 | +93.0000 | +150.0000 | +69.0000 | +1.0530 | +1.4028 | +nan | +nan | +
| next.reward | +boolean | +187507 | +0 | +0.0000 | +1 | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +0.0 | +187507.0000 | +
| next.done | +boolean | +187507 | +0 | +0.0000 | +2 | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +False | +185512.0000 | +
| index | +numeric | +187507 | +0 | +0.0000 | +187507 | +93753.0000 | +93753.0000 | +54128.7528 | +125.0027 | +0.5774 | +46877.0000 | +0.0000 | +187506.0000 | +187506.0000 | +9375.3000 | +46876.5000 | +140629.5000 | +178130.7000 | +93753.0000 | +-0.0000 | +-1.2000 | +nan | +nan | +
| task_index | +categorical | +187507 | +0 | +0.0000 | +3 | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +nan | +2 | +67436.0000 | +
| column | +n | +skewness | +skew_type | +kurtosis | +kurt_type | +normality_test | +normality_p | +is_normal_0.05 | +shapiro_p | +dagostino_p | +ks_p | +anderson_stat | +anderson_5pct_cv | +
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| timestamp | +187507 | +1.0530 | +high skew | +1.4028 | +leptokurtic | +dagostino | +0.0000 | +False | +NaN | +0.0000 | +0.0000 | +3223.8222 | +0.7520 | +
| episode_index | +187507 | +-0.0093 | +symmetric | +-1.2072 | +platykurtic | +dagostino | +0.0000 | +False | +NaN | +0.0000 | +0.0000 | +2148.6268 | +0.7520 | +
| frame_index | +187507 | +1.0530 | +high skew | +1.4028 | +leptokurtic | +dagostino | +0.0000 | +False | +NaN | +0.0000 | +0.0000 | +3223.8205 | +0.7520 | +
| index | +187507 | +-0.0000 | +symmetric | +-1.2000 | +platykurtic | +dagostino | +0.0000 | +False | +NaN | +0.0000 | +0.0000 | +2084.9274 | +0.7520 | +
| column | +VIF | +multicollinearity | +
|---|---|---|
| episode_index | +266.2900 | +severe | +
| index | +-0.0000 | +low | +
| frame_index | +-57054521769846057730048.0000 | +low | +
| timestamp | +-57054521769871039004672.0000 | +low | +
| column | +missing_count | +missing_ratio | +missing_% | +dtype | +
|---|---|---|---|---|
| observation.state | +0 | +0.0000 | +0.0000 | +object | +
| action | +0 | +0.0000 | +0.0000 | +object | +
| timestamp | +0 | +0.0000 | +0.0000 | +float32 | +
| episode_index | +0 | +0.0000 | +0.0000 | +int64 | +
| frame_index | +0 | +0.0000 | +0.0000 | +int64 | +
| next.reward | +0 | +0.0000 | +0.0000 | +float32 | +
| next.done | +0 | +0.0000 | +0.0000 | +bool | +
| index | +0 | +0.0000 | +0.0000 | +int64 | +
| task_index | +0 | +0.0000 | +0.0000 | +int64 | +
| column | +q1 | +q3 | +iqr | +lower_bound | +upper_bound | +outlier_count | +outlier_% | +min_outlier | +max_outlier | +
|---|---|---|---|---|---|---|---|---|---|
| timestamp | +2.4000 | +9.3000 | +6.9000 | +-7.9500 | +19.6500 | +2712.0000 | +1.4500 | +19.7000 | +35.1000 | +
| episode_index | +501.0000 | +1503.0000 | +1002.0000 | +-1002.0000 | +3006.0000 | +0.0000 | +0.0000 | +nan | +nan | +
| frame_index | +24.0000 | +93.0000 | +69.0000 | +-79.5000 | +196.5000 | +2712.0000 | +1.4500 | +197.0000 | +351.0000 | +
| index | +46876.5000 | +140629.5000 | +93753.0000 | +-93753.0000 | +281259.0000 | +0.0000 | +0.0000 | +nan | +nan | +
| column | +count | +unique | +top_value | +top_frequency | +top_% | +entropy | +norm_entropy | +
|---|---|---|---|---|---|---|---|
| task_index | +187507 | +3 | +2 | +67436 | +35.9600 | +1.5806 | +0.9973 | +
| column | +variance | +std | +cv | +range | +
|---|---|---|---|---|
| index | +2929921879.6667 | +54128.7528 | +0.5774 | +187506.0000 | +
| episode_index | +328572.6540 | +573.2126 | +0.5704 | +1994.0000 | +
| frame_index | +2367.6604 | +48.6586 | +0.7709 | +351.0000 | +
| timestamp | +23.6766 | +4.8659 | +0.7709 | +35.1000 | +
| component | +variance_ratio | +cumulative_ratio | +eigenvalue | +
|---|---|---|---|
| PC1 | +0.5058 | +0.5058 | +2.0232 | +
| PC2 | +0.4942 | +1.0000 | +1.9768 | +
| PC3 | +0.0000 | +1.0000 | +0.0001 | +
| PC4 | +0.0000 | +1.0000 | +0.0000 | +
| + | PC1 | +PC2 | +PC3 | +PC4 | +
|---|---|---|---|---|
| timestamp | +0.5003 | +-0.4997 | +-0.0001 | +0.7071 | +
| episode_index | +0.4996 | +0.5004 | +-0.7071 | +0.0000 | +
| frame_index | +0.5003 | +-0.4997 | +-0.0001 | +-0.7071 | +
| index | +0.4998 | +0.5002 | +0.7071 | +-0.0000 | +
Dataset contains 187,507 rows and 9 columns (4 numeric, 1 categorical). 4 high-priority finding(s) detected. 5 moderate observations noted. Key highlights: 1. 2 column pair(s) with |r| > 0.9 2. 2 likely confounded correlation(s) detected 3. 4/4 numeric columns are non-normal
Near-perfect linear relationships detected. Top pair: 'timestamp' ↔ 'frame_index' (r=1.000).
Raw correlation differs significantly from partial correlation, suggesting confounding variables. Top: 'episode_index' ↔ 'index' (raw r=1.00, partial r=8183723376125764.00).
Most numeric columns fail normality tests (α=0.05). Non-parametric methods may be more appropriate.
Distribution fitting reveals non-Normal best fits. Most common: beta (2 columns). Others: {'beta': 2, 'lognorm': 1, 'uniform': 1}.
K-Means identifies 3 well-separated clusters (silhouette=0.40). Cluster sizes: {'cluster_0': 1895, 'cluster_1': 1882, 'cluster_2': 1223}.
VIF > 10 detected for: ['episode_index']. Worst: 'episode_index' (VIF=266.3). Redundant information may cause model instability.
Top interaction: 'timestamp' × 'episode_index' (strength=0.73). Product features may improve model performance.
Box-Cox / Yeo-Johnson transforms can significantly reduce skewness for columns: ['timestamp', 'frame_index'].
A small fraction of rows are flagged by multiple anomaly detection methods.
All columns are fully populated — no imputation needed.
| column | +best_distribution | +aic | +bic | +ks_statistic | +ks_p_value | +fit_quality | +
|---|---|---|---|---|---|---|
| timestamp | +beta | +28275.5600 | +28301.6300 | +0.0363 | +0.0000 | +poor | +
| episode_index | +beta | +75977.6500 | +76003.7200 | +0.0150 | +0.2102 | +good | +
| frame_index | +lognorm | +20261.0000 | +20280.5500 | +0.5098 | +0.0000 | +poor | +
| index | +uniform | +121419.0200 | +121432.0600 | +0.0136 | +0.3124 | +good | +
| column | +jb_statistic | +p_value | +is_normal_0.05 | +skewness | +kurtosis | +
|---|---|---|---|---|---|
| timestamp | +50024.3984 | +0.0000 | +False | +1.0530 | +1.4028 | +
| episode_index | +11389.0254 | +0.0000 | +False | +-0.0093 | +-1.2072 | +
| frame_index | +50024.3684 | +0.0000 | +False | +1.0530 | +1.4028 | +
| index | +11250.4200 | +0.0000 | +False | +-0.0000 | +-1.2000 | +
| column | +original_skewness | +recommended_method | +optimal_lambda | +transformed_skewness | +needs_transform | +improvement | +
|---|---|---|---|---|---|---|
| timestamp | +1.0530 | +yeo-johnson | +0.2569 | +-0.0496 | +True | +1.0034 | +
| episode_index | +-0.0093 | +yeo-johnson | +0.7184 | +-0.2834 | +False | +-0.2741 | +
| frame_index | +1.0530 | +yeo-johnson | +0.3990 | +-0.0965 | +True | +0.9565 | +
| index | +-0.0000 | +yeo-johnson | +0.7071 | +-0.2916 | +False | +-0.2916 | +
| column | +n | +std | +iqr | +silverman_bandwidth | +scotts_bandwidth | +
|---|---|---|---|---|---|
| timestamp | +187507.0000 | +4.8659 | +6.9000 | +0.3862 | +0.2967 | +
| episode_index | +187507.0000 | +573.2126 | +1002.0000 | +45.4941 | +34.9517 | +
| frame_index | +187507.0000 | +48.6586 | +69.0000 | +3.8619 | +2.9670 | +
| index | +187507.0000 | +54128.7528 | +93753.0000 | +4296.0273 | +3300.5092 | +
| + | timestamp | +episode_index | +frame_index | +index | +
|---|---|---|---|---|
| timestamp | +1.0000 | +6.0442 | +-1.0000 | +-99974086307298640.0000 | +
| episode_index | +6.0442 | +1.0000 | +-6.0442 | +8183723376125764.0000 | +
| frame_index | +-1.0000 | +-6.0442 | +1.0000 | +99974086307314160.0000 | +
| index | +-99974086307298656.0000 | +8183723376125764.0000 | +99974086307314160.0000 | +1.0000 | +
| + | timestamp | +episode_index | +frame_index | +index | +
|---|---|---|---|---|
| timestamp | +0.0000 | +0.0362 | +5.0693 | +0.0353 | +
| episode_index | +0.0362 | +0.0000 | +0.0000 | +6.3049 | +
| frame_index | +5.0693 | +0.0000 | +0.0000 | +0.0000 | +
| index | +0.0353 | +6.3049 | +0.0000 | +0.0000 | +
| + | col_a | +col_b | +pearson_r | +ci_lower | +ci_upper | +ci_width | +significant | +
|---|---|---|---|---|---|---|---|
| 0 | +timestamp | +episode_index | +0.0221 | +-0.0039 | +0.0485 | +0.0524 | +False | +
| 1 | +timestamp | +frame_index | +1.0000 | +1.0000 | +1.0000 | +0.0000 | +True | +
| 2 | +timestamp | +index | +0.0225 | +-0.0048 | +0.0508 | +0.0556 | +False | +
| 3 | +episode_index | +frame_index | +0.0221 | +-0.0045 | +0.0496 | +0.0541 | +False | +
| 4 | +episode_index | +index | +0.9999 | +0.9999 | +1.0000 | +0.0000 | +True | +
| 5 | +frame_index | +index | +0.0225 | +-0.0054 | +0.0500 | +0.0554 | +False | +
| + | timestamp | +episode_index | +frame_index | +index | +
|---|---|---|---|---|
| timestamp | +1.0000 | +0.0360 | +1.0000 | +0.0361 | +
| episode_index | +0.0360 | +1.0000 | +0.0360 | +0.9999 | +
| frame_index | +1.0000 | +0.0360 | +1.0000 | +0.0361 | +
| index | +0.0361 | +0.9999 | +0.0361 | +1.0000 | +
| + | timestamp | +episode_index | +frame_index | +index | +
|---|---|---|---|---|
| cluster_0 | +4.1473 | +1508.9013 | +41.4728 | +141229.8786 | +
| cluster_1 | +4.3556 | +481.7476 | +43.5563 | +44448.0032 | +
| cluster_2 | +13.2153 | +1064.5127 | +132.1529 | +99369.0989 | +
| + | factor_1 | +factor_2 | +
|---|---|---|
| timestamp | +1.0000 | +-0.0000 | +
| episode_index | +0.0221 | +-0.9997 | +
| frame_index | +1.0000 | +0.0000 | +
| index | +0.0225 | +-0.9997 | +
| column | +contribution_score | +rank | +
|---|---|---|
| timestamp | +0.5000 | +4.0000 | +
| episode_index | +0.5000 | +2.0000 | +
| frame_index | +0.5000 | +3.0000 | +
| index | +0.5000 | +1.0000 | +
| + | col_a | +col_b | +interaction_strength | +corr_product_a | +corr_product_b | +corr_a_b | +recommendation | +
|---|---|---|---|---|---|---|---|
| 0 | +timestamp | +episode_index | +0.7259 | +0.7480 | +0.5459 | +0.0221 | +Strong interaction | +
| 1 | +episode_index | +frame_index | +0.7259 | +0.5459 | +0.7480 | +0.0221 | +Strong interaction | +
| 2 | +timestamp | +index | +0.7217 | +0.7442 | +0.5495 | +0.0225 | +Strong interaction | +
| 3 | +frame_index | +index | +0.7217 | +0.7442 | +0.5495 | +0.0225 | +Strong interaction | +
| column | +n_bins | +equal_width_entropy | +equal_freq_entropy | +max_entropy | +recommended_method | +skewness | +
|---|---|---|---|---|---|---|
| timestamp | +10 | +2.2300 | +3.3211 | +3.3219 | +equal_frequency | +1.0530 | +
| episode_index | +10 | +3.3208 | +3.3219 | +3.3219 | +equal_width | +-0.0093 | +
| frame_index | +10 | +2.2300 | +3.3211 | +3.3219 | +equal_frequency | +1.0530 | +
| index | +10 | +3.3219 | +3.3219 | +3.3219 | +equal_width | +-0.0000 | +
| + | col_a | +col_b | +levene_stat | +p_value | +log_var_ratio | +adjusted_p | +significant_0.05 | +stars | +
|---|---|---|---|---|---|---|---|---|
| 0 | +timestamp | +episode_index | +564897.5197 | +0.0000 | +9.5380 | +0.0000 | +True | +*** | +
| 1 | +timestamp | +frame_index | +222079.0332 | +0.0000 | +4.6052 | +0.0000 | +True | +*** | +
| 2 | +timestamp | +index | +562425.8915 | +0.0000 | +18.6338 | +0.0000 | +True | +*** | +
| 3 | +episode_index | +frame_index | +482754.0326 | +0.0000 | +4.9329 | +0.0000 | +True | +*** | +
| 4 | +episode_index | +index | +550576.4333 | +0.0000 | +9.0957 | +0.0000 | +True | +*** | +
| 5 | +frame_index | +index | +561596.5666 | +0.0000 | +14.0286 | +0.0000 | +True | +*** | +
| + | grouping_col | +numeric_col | +n_groups | +h_statistic | +p_value | +eta_squared | +effect_magnitude | +adjusted_p | +reject_h0_0.05 | +stars | +interpretation | +
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | +task_index | +timestamp | +3 | +10663.7432 | +0.0000 | +0.0569 | +small | +0.0000 | +True | +*** | +Significant (η²=0.0569, small) | +
| 1 | +task_index | +episode_index | +3 | +625.0241 | +0.0000 | +0.0033 | +small | +0.0000 | +True | +*** | +Significant (η²=0.0033, small) | +
| 2 | +task_index | +frame_index | +3 | +10663.6852 | +0.0000 | +0.0569 | +small | +0.0000 | +True | +*** | +Significant (η²=0.0569, small) | +
| 3 | +task_index | +index | +3 | +625.0238 | +0.0000 | +0.0033 | +small | +0.0000 | +True | +*** | +Significant (η²=0.0033, small) | +
| + | col_a | +col_b | +u_statistic | +p_value | +rank_biserial_r | +effect_magnitude | +adjusted_p | +significant_0.05 | +stars | +
|---|---|---|---|---|---|---|---|---|---|
| 0 | +timestamp | +episode_index | +95185168.0000 | +0.0000 | +0.9946 | +large | +0.0000 | +True | +*** | +
| 1 | +timestamp | +frame_index | +2516704372.0000 | +0.0000 | +0.8568 | +large | +0.0000 | +True | +*** | +
| 2 | +timestamp | +index | +1278510.0000 | +0.0000 | +0.9999 | +large | +0.0000 | +True | +*** | +
| 3 | +episode_index | +frame_index | +34126631417.0000 | +0.0000 | +-0.9413 | +large | +0.0000 | +True | +*** | +
| 4 | +episode_index | +index | +188532431.5000 | +0.0000 | +0.9893 | +large | +0.0000 | +True | +*** | +
| 5 | +frame_index | +index | +11929357.5000 | +0.0000 | +0.9993 | +large | +0.0000 | +True | +*** | +
| column | +n_categories | +chi2_stat | +p_value | +cramers_v | +effect_magnitude | +uniform_0.05 | +interpretation | +
|---|---|---|---|---|---|---|---|
| task_index | +3 | +1112.7892 | +0.0000 | +0.0545 | +small | +False | +Non-uniform distribution | +
| column | +suspect_value | +grubbs_statistic | +critical_value | +is_outlier | +n | +
|---|---|---|---|---|---|
| timestamp | +35.1000 | +5.9163 | +5.1454 | +True | +187507 | +
| episode_index | +0.0000 | +1.7532 | +5.1454 | +False | +187507 | +
| frame_index | +351.0000 | +5.9163 | +5.1454 | +True | +187507 | +
| index | +0.0000 | +1.7320 | +5.1454 | +False | +187507 | +
| column | +primary_role | +confidence | +secondary_role | +properties | +
|---|---|---|---|---|
| observation.state | +id | +0.8500 | +NaN | +{'unique_ratio': 1.0} | +
| action | +id | +0.8500 | +NaN | +{'unique_ratio': 1.0} | +
| timestamp | +timestamp | +0.7000 | +NaN | +{'dtype': 'float32', 'hint': 'monotonic numeric with time-like name'} | +
| episode_index | +numeric_feature | +0.8500 | +NaN | +{'dtype': 'int64'} | +
| frame_index | +numeric_feature | +0.8500 | +NaN | +{'dtype': 'int64'} | +
| next.reward | +constant | +1.0000 | +NaN | +{'n_unique': 1} | +
| next.done | +binary | +0.9000 | +NaN | +{'n_unique': 2, 'values': [False, True]} | +
| index | +id | +0.9000 | +NaN | +{'unique_ratio': 1.0} | +
| task_index | +categorical_feature | +0.8500 | +NaN | +{'n_unique': 3, 'unique_ratio': 0.0} | +