diff --git a/README.md b/README.md index 8e6caaf..f3b52f1 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,24 @@ TinyExp focuses on simple, maintainable experiment management: - Your config stays structured and easy to override. - Your execution path stays consistent as experiments grow. +## Design Philosophy + +TinyExp is intentionally light. + +It is not trying to be a heavy trainer framework that owns your epoch loop, callback system, or full runtime +lifecycle. Instead, it focuses on a smaller goal: + +- keep the experiment itself as the main entrypoint +- keep the training loop in user space +- make configuration and launch behavior explicit +- expose shared capabilities through focused `XXXCfg` components +- provide thin helpers rather than framework-owned control flow +- treat examples as reusable recipes, not just demos + +In short, TinyExp should help you write less experiment plumbing, not less experiment logic. + +For a longer explanation, see [`docs/philosophy.md`](docs/philosophy.md). + ## Quick Start (1 Minute) ### Option A: Install with pip and use import-based entrypoint diff --git a/docs/index.md b/docs/index.md index 06881fc..722039c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,10 +2,40 @@ A minimalist Python project for deep learning experiment management. -TinyExp lets you launch experiments with one click: the file you edit becomes the entrypoint to your experiment. +TinyExp keeps one idea at the center: +your configured experiment is your entrypoint. -# Features +Instead of splitting config, launcher, and execution across many files, TinyExp keeps them together in one experiment +definition so iteration stays fast and predictable. + +## What You Get + +- Experiment-centered configuration with Hydra/OmegaConf +- CLI overrides without rewriting code +- Training loops that stay close to plain PyTorch +- The same experiment definition from local debug to distributed launch + +## Design Philosophy + +TinyExp is intentionally light. 
+ +It is not trying to be a heavy trainer framework that owns your epoch loop, callback system, or full runtime +lifecycle. Instead, it focuses on a smaller and more explicit goal: + +- keep the experiment itself as the main entrypoint +- keep the training loop in user space +- make configuration and launch behavior explicit +- expose shared capabilities through focused `XXXCfg` components +- provide thin helpers instead of framework-owned control flow +- treat examples as reusable recipes, not just demos + +In short, TinyExp should help you write less experiment plumbing, not less experiment logic. + +For the longer version, see [Design Philosophy](philosophy.md). + +## Features - ๐Ÿš€ One-click experiment launch: The file you edit becomes the entrypoint to your experiment. -- ๐Ÿ“Š Experiment tracking: Track your experiments with W&B. -- ๐Ÿ”„ Experiment management: Manage your experiments configuration with Hydra. +- ๐Ÿ”„ Config-driven experiment management with Hydra. +- ๐Ÿงฉ Thin helpers without taking over your training loop. +- ๐Ÿงช Examples that can serve as reusable experiment recipes. diff --git a/docs/phase1-api-draft.md b/docs/phase1-api-draft.md new file mode 100644 index 0000000..1f50735 --- /dev/null +++ b/docs/phase1-api-draft.md @@ -0,0 +1,276 @@ +# Phase 1: API Draft + +This document proposes a concrete API shape for the first implementation slice. + +It should be read together with: + +- [Design Philosophy](philosophy.md) +- [Phase 1: Minimal Helpers Plan](phase1-minimal-helpers.md) +- [Phase 1: File-by-File Implementation Plan](phase1-file-by-file-plan.md) + +This is still a draft. The goal is to make the intended shape explicit before implementation expands further. 
+ +## Drafting Principles + +The Phase 1 API should follow these rules: + +- keep `TinyExp` small +- keep training and evaluation loops in examples +- expose shared capabilities through focused `XXXCfg` components +- make configuration override-friendly through Hydra +- keep execution explicit through method calls +- avoid introducing trainer-like control flow + +## `TinyExp` Draft Surface + +Phase 1 keeps the root experiment object intentionally small. + +### Fields + +Recommended experiment-level fields: + +- `mode: str = "train"` +- `resume_from: str = ""` +- `output_root: str = "./output"` +- `exp_name: str = ...` + +These fields describe the experiment as a whole rather than one isolated feature subsystem. + +### Composed config components + +The experiment object should continue to expose capability-specific config components, such as: + +- `logger_cfg` +- `wandb_cfg` +- `checkpoint_cfg` + +Other feature configs may be added later only if they earn their place. + +### Methods + +Recommended Phase 1 methods on `TinyExp`: + +```python +def get_run_dir(self) -> str: + ... +``` + +This belongs on `TinyExp` because it is experiment-scoped, not feature-scoped. + +## `CheckpointCfg` Draft + +Checkpointing is the main new shared capability in Phase 1. + +It should follow the same cfg-driven pattern as logger and W&B integration: + +- fields define behavior and defaults +- methods perform explicit actions only when called + +### Draft fields + +```python +@dataclass +class CheckpointCfg: + last_ckpt_name: str = "last.ckpt" + best_ckpt_name: str = "best.ckpt" +``` + +Phase 1 should keep this deliberately small. + +### Draft methods + +```python +def save_checkpoint( + self, + *, + run_dir: str, + name: str, + model=None, + optimizer=None, + scheduler=None, + epoch: int | None = None, + global_step: int | None = None, + best_metric: float | None = None, + extra_state: dict | None = None, +) -> str: + ... 
+ +def load_checkpoint( + self, + path: str, + *, + model=None, + optimizer=None, + scheduler=None, + strict: bool = True, + map_location=None, +) -> dict: + ... +``` + +### Responsibilities + +`CheckpointCfg` should: + +- define default checkpoint filenames +- save a standard checkpoint structure +- load a standard checkpoint structure +- optionally restore state into provided model / optimizer / scheduler objects + +### Non-responsibilities + +`CheckpointCfg` should not decide: + +- when to save +- whether to save best checkpoints +- which metric is considered best +- whether resume is automatic +- how many checkpoints to retain + +Those remain example-level or user-level decisions. + +## Draft Checkpoint Format + +The checkpoint format should be simple and explicit. + +Recommended structure: + +```python +{ + "model_state_dict": ..., + "optimizer_state_dict": ..., + "scheduler_state_dict": ..., + "epoch": ..., + "global_step": ..., + "best_metric": ..., + "meta": { + "exp_name": ..., + "exp_class": ..., + "saved_at": ..., + }, + ... +} +``` + +Notes: + +- `optimizer_state_dict` is optional +- `scheduler_state_dict` is optional +- `meta` should stay lightweight +- `extra_state` can extend the structure without forcing premature abstraction + +## Run Directory Draft + +Run directory behavior should remain simple in Phase 1. + +### Draft methods + +```python +def get_run_dir(self) -> str: + ... +``` + +### Expected behavior + +- `get_run_dir()` returns `os.path.join(self.output_root, self.exp_name)` +- methods that write files should create parent directories when needed + +Phase 1 should not add timestamped run folders, version managers, or heavier run registry behavior. + +## Example Usage Draft + +Below is the intended style for examples after Phase 1. 
+ +### Logger setup + +```python +run_dir = self.get_run_dir() +logger = self.logger_cfg.build_logger( + save_dir=run_dir, + distributed_rank=accelerator.rank, +) +``` + +### Explicit W&B usage + +```python +if self.wandb_cfg.enable_wandb: + wandb = self.wandb_cfg.build_wandb( + accelerator=accelerator, + project="Baselines", + config=cfg_dict, + ) +``` + +### Explicit checkpoint load + +```python +resume_state = None +if self.resume_from: + resume_state = self.checkpoint_cfg.load_checkpoint( + self.resume_from, + model=model, + optimizer=optimizer, + scheduler=scheduler, + map_location=accelerator.device, + ) +``` + +### Explicit checkpoint save + +```python +self.checkpoint_cfg.save_checkpoint( + run_dir=run_dir, + name=self.checkpoint_cfg.last_ckpt_name, + model=accelerator.unwrap_model(model), + optimizer=optimizer, + scheduler=scheduler, + epoch=epoch, + global_step=global_step, + best_metric=best_metric, +) +``` + +### Explicit best checkpoint policy in example code + +```python +if best_metric is None or val_metric > best_metric: + best_metric = val_metric + self.checkpoint_cfg.save_checkpoint( + run_dir=run_dir, + name=self.checkpoint_cfg.best_ckpt_name, + model=accelerator.unwrap_model(model), + optimizer=optimizer, + scheduler=scheduler, + epoch=epoch, + global_step=global_step, + best_metric=best_metric, + ) +``` + +This is the intended balance: + +- framework provides the capability +- example decides when to use it + +## API Choices Deferred Beyond Phase 1 + +The following questions should stay open until the first implementation slice proves itself: + +- whether metrics deserve their own `MetricCfg` +- whether `CheckpointCfg` should move into its own module +- whether shared recipe base classes are worth introducing + +These should not be over-designed before the first slice is working. 
+ +## Success Criteria + +The Phase 1 API draft is successful if it leads to implementation that feels: + +- small +- explicit +- override-friendly +- consistent with the existing `XXXCfg` style +- still close to plain PyTorch + +That is the standard Phase 1 should be judged against. diff --git a/docs/phase1-file-by-file-plan.md b/docs/phase1-file-by-file-plan.md new file mode 100644 index 0000000..5cb8e51 --- /dev/null +++ b/docs/phase1-file-by-file-plan.md @@ -0,0 +1,306 @@ +# Phase 1: File-by-File Implementation Plan + +This document turns the Phase 1 helper direction into a file-by-file implementation plan. + +It follows the same constraints described in: + +- [Design Philosophy](philosophy.md) +- [Phase 1: Minimal Helpers Plan](phase1-minimal-helpers.md) + +The key design rule is unchanged: + +- shared capabilities should usually be exposed through focused `XXXCfg` classes +- those config fields should be Hydra-override-friendly +- behavior should execute only when the user explicitly calls a method + +This plan is intentionally conservative. It aims to establish one clean first slice, not to solve every future need in +one pass. + +## Phase 1 Scope + +The first implementation slice should focus on: + +- a stable run directory helper +- a new `CheckpointCfg` +- `mode=val` support through explicit checkpoint loading +- a migration of the MNIST example to validate the design + +This phase should not introduce: + +- a trainer abstraction +- a runtime layer +- callback systems +- automatic checkpoint policy +- automatic resume behavior + +## Files to Change + +The recommended Phase 1 file set is intentionally small: + +- `tinyexp/__init__.py` +- `tinyexp/examples/mnist_exp.py` +- `tests/test_tinyexp.py` or a new artifact-focused test file +- a new checkpoint-focused test file +- optionally one example-level integration test for validation mode + +The ResNet example should not be part of the first slice. + +## 1. 
`tinyexp/__init__.py` + +This is the main design anchor for Phase 1. + +### Why this file changes + +This file already defines: + +- `TinyExp` +- `LoggerCfg` +- `WandbCfg` +- `RedisCfgMixin` + +That makes it the natural place to reinforce the cfg-driven model and add the first checkpoint component. + +### What should change + +#### Keep `TinyExp` small + +`TinyExp` should continue to be the root experiment object, but it should not turn into a feature sink. + +It should remain responsible for: + +- experiment-level config structure +- launcher-facing fields +- a small number of experiment-wide helpers +- composition of shared `XXXCfg` components + +#### Add only minimal experiment-wide fields + +Recommended additions or clarifications: + +- `mode: str = "train"` +- `resume_from: str = ""` + +These belong at the experiment level because they describe run intent rather than one isolated feature subsystem. + +#### Add only minimal experiment-wide helpers + +Recommended methods on `TinyExp`: + +- `get_run_dir() -> str` + +This is a good fit for `TinyExp` because it is experiment-scoped rather than belonging to a single feature config. + +### What should not be added here + +Avoid adding many feature-specific top-level methods such as: + +- `save_checkpoint(...)` +- `load_checkpoint(...)` + +Those are better expressed through a focused config component. + +## 2. Add `CheckpointCfg` in `tinyexp/__init__.py` + +For the first slice, `CheckpointCfg` can live in `tinyexp/__init__.py` alongside `LoggerCfg` and `WandbCfg`. + +This keeps the initial implementation simple and consistent with the current project structure. + +If it grows later, it can be split into a dedicated module. 
+ +### Why `CheckpointCfg` + +Checkpointing fits the cfg-driven TinyExp pattern well: + +- filenames and related defaults are configuration +- save/load methods are explicit actions +- users choose when to call those methods + +This is more aligned with TinyExp's style than adding many checkpoint methods directly to `TinyExp`. + +### Recommended `CheckpointCfg` scope + +Fields: + +- `last_ckpt_name: str = "last.ckpt"` +- `best_ckpt_name: str = "best.ckpt"` + +Methods: + +- `save_checkpoint(...) -> str` +- `load_checkpoint(...) -> dict` + +### Recommended responsibilities + +`CheckpointCfg` should handle: + +- default checkpoint filenames +- run-dir-relative checkpoint path generation when useful +- standard save format +- standard load behavior +- optional loading into model / optimizer / scheduler objects + +### What `CheckpointCfg` should not own + +Do not put policy into `CheckpointCfg`, including: + +- when to save +- whether to save best checkpoints +- how to compare best metrics +- save frequency +- retention policies +- automatic resume behavior + +Those decisions belong in the example or user code. + +## 3. `tinyexp/examples/mnist_exp.py` + +This file should be the first real migration target. + +### Why this file changes first + +The MNIST example is: + +- small enough to change safely +- representative of the intended user workflow +- a good way to validate whether the cfg-driven helper design actually reduces useful boilerplate + +### What should change + +#### `run()` should adopt experiment-wide helpers + +Expected updates: + +- call `self.get_run_dir()` +- build the logger using `self.logger_cfg.build_logger(...)` +- branch on `self.mode` + +#### training should remain explicit + +The training loop should stay in the example. 
+ +What should change is only the repeated plumbing: + +- explicit checkpoint loading when `self.resume_from` is set +- explicit calls to `self.checkpoint_cfg.save_checkpoint(...)` +- explicit best-checkpoint save logic, still decided by the example + +#### validation should also stay explicit + +For `mode=val`, the example should: + +- require a meaningful `resume_from` +- explicitly call `self.checkpoint_cfg.load_checkpoint(...)` +- run evaluation logic in example code + +The example remains responsible for evaluation semantics. + +### What should not change + +Do not try to extract: + +- a trainer +- a recipe base class +- generic evaluation policy + +Those can be revisited later only if repeated patterns clearly emerge. + +## 4. `tinyexp/examples/resnet_exp.py` + +This file should not be part of the first implementation slice. + +### Why it should wait + +The ResNet example includes additional concerns: + +- DDP usage +- ImageNet-specific data loading +- Redis-backed caching +- a more complex training setup + +It is not the right place to define the first minimal checkpoint and artifact API. + +### Recommended Phase 1 stance + +- leave it unchanged +- only revisit after the MNIST migration proves the shape of the APIs + +## 5. Tests + +Phase 1 needs lightweight but meaningful coverage. + +### Recommended test additions + +#### Artifact tests + +Add or expand tests for: + +- `get_run_dir()` + +These can live in: + +- `tests/test_tinyexp.py` +- or a new `tests/test_tinyexp_artifacts.py` + +#### Checkpoint tests + +Add a dedicated checkpoint test file, for example: + +- `tests/test_tinyexp_checkpoint_cfg.py` + +Cover: + +- save model-only checkpoint +- save/load with optimizer and scheduler +- standard metadata presence +- correct state restoration + +#### Example-level validation test + +Add one small integration-style test for: + +- `mode=val` +- loading a checkpoint through `resume_from` + +This should stay CPU-first and deterministic. + +## 6. 
Files Not Needed in Phase 1 + +The following files or modules do not need changes in the first slice: + +- `tinyexp/utils/ray_utils.py` +- `tinyexp/tiny_engine/accelerator/*` +- `tinyexp/examples/resnet_exp.py` +- Redis-related utilities + +This is important. + +The first slice should validate the cfg-driven artifact pattern, not broaden the implementation surface. + +## Recommended Implementation Order + +The order below minimizes risk and keeps the design easy to validate. + +1. update `tinyexp/__init__.py` with: + - `mode` + - `resume_from` + - `get_run_dir()` + - `CheckpointCfg` +2. add checkpoint-focused tests +3. migrate `tinyexp/examples/mnist_exp.py` +4. add validation-mode test coverage +5. only then decide whether any further cfg component is worth introducing + +## Stop Point for Phase 1 + +Phase 1 should stop once the following are true: + +- experiment-level artifact basics are available +- checkpointing is exposed through `checkpoint_cfg` +- the MNIST example uses the new pattern successfully +- validation from checkpoint works +- the project still feels light and explicit + +That stop point matters. + +The goal of Phase 1 is not to fully design TinyExp's long-term helper ecosystem. The goal is to establish one clean, +cfg-driven example of how shared capabilities should be added without drifting toward a trainer framework. diff --git a/docs/phase1-minimal-helpers.md b/docs/phase1-minimal-helpers.md new file mode 100644 index 0000000..51d05bf --- /dev/null +++ b/docs/phase1-minimal-helpers.md @@ -0,0 +1,236 @@ +# Phase 1: Minimal Helpers Plan + +This document describes the first implementation phase that follows TinyExp's design philosophy. + +The goal is not to turn TinyExp into a trainer framework. The goal is to add a small set of reusable, +configuration-driven helpers that remove repeated experiment plumbing while keeping training loops in user space. + +For the broader principles behind these choices, see [Design Philosophy](philosophy.md). 
+ +## Why This Phase Exists + +TinyExp already has a clear core direction: + +- the experiment is the entrypoint +- configuration is explicit and override-friendly +- launch behavior should stay simple +- users should keep control of their training loop + +What is still missing is a minimal layer for common experiment chores that many examples will otherwise repeat by hand. + +This phase adds only thin helpers for those chores. It does not add a trainer, runtime, callback engine, or framework +owned lifecycle. + +It also follows an additional structural rule: + +- shared capabilities should usually be exposed through a focused `XXXCfg` class +- fields inside that class should be Hydra-override-friendly +- behavior should only run when the user explicitly calls a method on that config object + +## Goals + +Phase 1 should make experiments easier to run and maintain without changing TinyExp's character. + +The goals are: + +- keep plain PyTorch style intact +- reduce repeated setup code in examples +- improve reproducibility through lightweight artifacts +- make resume/eval workflows easier +- create a stable base for future examples and recipe-style inheritance + +## Non-Goals + +Phase 1 explicitly does not aim to add: + +- a generic trainer abstraction +- a runtime layer that owns epoch or step flow +- a callback or hook engine +- automatic external tracker initialization +- a framework-wide best-model policy system +- a heavy experiment lifecycle API + +If a feature starts to own the user workflow instead of helping it, it is out of scope for this phase. + +## Minimal Additions to TinyExp + +The base `TinyExp` class should remain small. This phase only proposes a few minimal additions directly on `TinyExp`, +and it prefers feature-specific `XXXCfg` classes for behavior-rich capabilities. 
+
+### New fields
+
+Recommended additions:
+
+- `mode: str = "train"`
+- `resume_from: str = ""`
+
+These are intentionally minimal:
+
+- `mode` provides a small, explicit switch for training, validation, and config help flows
+- `resume_from` provides a standard path for loading a checkpoint
+
+More policy-driven settings should stay in examples unless they prove broadly reusable.
+
+### New helper methods on `TinyExp`
+
+The following methods are the proposed Phase 1 surface area:
+
+- `get_run_dir() -> str`
+
+These are helpers, not control-flow abstractions.
+
+### New `CheckpointCfg`
+
+Checkpointing should follow the same overall style as existing components such as `logger_cfg` and `wandb_cfg`.
+
+Recommended addition:
+
+- `checkpoint_cfg: CheckpointCfg`
+
+Recommended scope for `CheckpointCfg`:
+
+- default checkpoint filenames
+- standard checkpoint save logic
+- standard checkpoint load logic
+
+Recommended methods:
+
+- `save_checkpoint(...) -> str`
+- `load_checkpoint(...) -> dict`
+
+This keeps checkpoint behavior grouped with its own configuration instead of expanding `TinyExp` with many feature
+specific methods.
+
+## Artifact Conventions
+
+Phase 1 should establish simple, stable artifact conventions.
+
+The recommended default run layout is:
+
+- `output/<exp_name>/last.ckpt`
+- `output/<exp_name>/best.ckpt`
+- `output/<exp_name>/log.txt`
+
+This layout is intentionally straightforward. It improves usability and reproducibility without introducing a heavy run
+management system.
+
+## Helper Behavior
+
+### Run directory helpers
+
+`get_run_dir()` should return the default run directory for the current experiment.
+
+Directory creation should happen in the method that actually writes files, such as logger setup,
+or checkpoint saving.
+
+This keeps `TinyExp` smaller and avoids a separate side-effect helper whose behavior can stay explicit
+at the write boundary.
+ +### Checkpoint helpers + +`checkpoint_cfg.save_checkpoint()` and `checkpoint_cfg.load_checkpoint()` should provide a standard way to persist and +recover experiment state. + +Recommended checkpoint content: + +- `model_state_dict` +- `optimizer_state_dict` when available +- `scheduler_state_dict` when available +- `epoch` +- `global_step` +- `best_metric` +- `meta` + +Recommended metadata: + +- `exp_name` +- `exp_class` +- `saved_at` + +The helper should only standardize the storage format. It should not decide when checkpoints are written. + +Resume should remain explicit in user code: + +- `resume_from` stores the path +- the example decides whether to call `checkpoint_cfg.load_checkpoint()` +- the example decides how to continue from the loaded state + +## Boundary Between TinyExp and Examples + +This phase depends on keeping a strong boundary between the framework and examples. + +### TinyExp should own + +- configuration structure and override ergonomics +- launch integration +- thin artifact helpers +- feature-specific `XXXCfg` components +- small reusable utilities shared across many experiments + +### Examples should own + +- model construction +- data loading details +- the training loop +- evaluation logic +- when validation runs +- when checkpoints are saved +- what metric counts as best +- whether and when external integrations are initialized + +This boundary is central to TinyExp's design. + +## Example Migration Strategy + +The first migration target should be `tinyexp/examples/mnist_exp.py`. 
+ +It is a good candidate because: + +- it is small enough to change safely +- it already represents the intended user-facing workflow +- it can validate whether the helpers are actually reducing useful boilerplate + +The migration should: + +- keep the training loop inside the example +- replace repeated path/config writing code with helpers +- add checkpoint save/load through `checkpoint_cfg` +- add `mode=val` using `resume_from` + +Only after this works well should TinyExp consider extracting a recipe-style base class from examples. + +## Testing Plan + +Phase 1 should be backed by lightweight tests. + +Recommended test coverage: + +- unit tests for run directory creation +- unit tests for checkpoint save/load +- a small integration test for `mode=val` + +The tests should stay CPU-first and deterministic. + +## Implementation Order + +Recommended implementation order: + +1. add run directory helpers +2. add `CheckpointCfg` with save/load +3. migrate `mnist_exp.py` +4. add `mode=val` +5. add tests + +This order keeps each change small and easy to validate. + +## Success Criteria + +Phase 1 is successful if TinyExp can do all of the following while still feeling light: + +- keep experiments centered around one explicit entrypoint +- preserve user-owned training loops +- save config in a standard way +- save and resume checkpoints with minimal boilerplate +- support a simple validation flow from a checkpoint + +In short, Phase 1 should make TinyExp more practical without making it more framework-heavy. diff --git a/docs/philosophy.md b/docs/philosophy.md new file mode 100644 index 0000000..dd3f530 --- /dev/null +++ b/docs/philosophy.md @@ -0,0 +1,182 @@ +# Design Philosophy + +TinyExp is intentionally small. + +It is not trying to become a heavy training framework, a trainer abstraction, or a system that takes over the +user's training loop. 
Its goal is much narrower and, for many research and iteration workflows, much more useful: +make experiment code easy to define, easy to launch, and easy to evolve without hiding plain PyTorch. + +## What TinyExp Is + +TinyExp is best understood as an experiment entry framework with a few lightweight helpers. + +It focuses on: + +- keeping the experiment definition as the main entrypoint +- making configuration explicit and easy to override +- expressing shared features through focused `XXXCfg` components +- supporting multiple launch styles without changing experiment code too much +- keeping user code close to normal PyTorch +- reducing repeated experiment "plumbing" without owning the full training lifecycle + +In practice, TinyExp wants the file you edit to remain the file you run. + +## What TinyExp Is Not + +TinyExp is intentionally not designed to be: + +- a general-purpose trainer framework +- a runtime system that owns epoch/step control flow +- a callback-heavy abstraction layer +- a framework that forces users into a single lifecycle or DSL +- a system that hides the actual training loop behind too many layers + +If a feature would make experiments feel less like plain PyTorch and more like framework ceremony, it is usually a +bad fit for TinyExp. + +## Core Principles + +### 1. The experiment is the entrypoint + +The experiment definition should be the center of the workflow. + +Users should not need to spread one experiment across many disconnected files just to launch, configure, and run it. +The experiment class should stay readable, local, and easy to reason about. + +### 2. Explicit is better than implicit + +TinyExp prefers explicit calls over hidden side effects. + +For example, integrations with external systems such as W&B should remain explicit. A config object can expose the +ability to build an integration, but the user should still decide when to call it. 
+ +This same rule applies to TinyExp features more broadly: + +- configuration should live in a small `XXXCfg` class +- config fields should be override-friendly through Hydra +- behavior should only run when the user explicitly calls a method on that config object + +This keeps configuration and execution separate while still letting execution be part of the experiment structure. + +### 3. Prefer `XXXCfg` components for shared features + +When TinyExp grows, new capabilities should usually be introduced as focused config components rather than as many +top-level methods on `TinyExp`. + +For example, a feature is often a better fit as: + +- `logger_cfg.build_logger(...)` +- `wandb_cfg.build_wandb(...)` +- `checkpoint_cfg.save_checkpoint(...)` + +than as a large collection of flat framework methods. + +This pattern keeps a feature's configuration and execution close together: + +- fields describe the feature and can be overridden through Hydra +- methods execute behavior only when the user explicitly calls them +- `TinyExp` itself stays smaller and easier to understand + +### 4. Keep the training loop in user space + +The training loop is often the most task-specific part of an experiment. TinyExp should not rush to abstract it into +a universal trainer. + +Users should be able to: + +- write their own training loop +- define their own evaluation logic +- control when to validate, log, save, or resume +- stay in plain PyTorch as much as possible + +### 5. Helpers are good; control frameworks are not + +TinyExp should provide thin, reusable helpers for common experiment chores, such as: + +- output directory setup +- checkpoint save/load helpers exposed through focused config components +- launcher compatibility + +These helpers reduce repeated boilerplate without dictating how the user structures training. + +### 6. Examples are recipes, not just demos + +Examples in TinyExp are not only meant to showcase features. 
They should also serve as reusable recipes and +inheritance-friendly templates. + +That means examples should remain understandable and useful as starting points for real projects. When common logic +emerges across multiple examples, it may be worth extracting a small helper or a recipe base class. But that logic +should only move into the framework when it is broadly useful and still keeps the system light. + +### 7. Framework-level additions must earn their place + +A good question for any new feature is: + +Does this reduce repeated experiment plumbing while preserving user control? + +If yes, it may belong in TinyExp. + +If it starts to own the user workflow, hide core control flow, or push the project toward a heavy trainer-style +architecture, it probably does not belong in TinyExp. + +This rule should be applied strictly. + +TinyExp should not add a new helper, artifact, field, or abstraction just because it sounds generally useful or is a +common pattern in other frameworks. A framework-level addition should only be kept when its value is clear in the +current project, not as a placeholder for possible future needs. + +In practice, that means asking: + +- is there real repeated boilerplate across examples today? +- does this introduce a genuinely useful capability, or only another way to express something already visible? +- if this were removed, would users lose something important, or only a convenience wrapper? +- is the benefit strong enough to justify one more method, field, file, or documented convention? + +If these questions do not have a strong answer, the feature should usually stay out of TinyExp. 
+ +Examples of things that often fail this test are: + +- thin one-line wrapper helpers added only for style or lint appeasement +- duplicate artifacts that do not add clear value over the experiment definition and logs +- speculative schema fields added only for future-proofing before any real compatibility need exists + +For example, if the experiment class already defines the configuration and the runtime logger already records the +effective config, that does not automatically justify a separate `dump_config()` helper or a default `config.yaml` +artifact. Those should exist only if they solve a concrete current problem that the existing structure does not. + +## Recommended Boundary + +### TinyExp should own + +- configuration structure and CLI overrides +- experiment entry and launch ergonomics +- lightweight utilities shared across many experiments +- small `XXXCfg` components for shared capabilities +- minimal helpers that do not take over control flow + +### Examples or user experiments should own + +- model construction details +- training and evaluation loops +- task-specific metrics +- validation timing +- checkpointing policy such as what counts as "best" +- external integration timing and usage + +This boundary keeps TinyExp small while still making it genuinely useful. + +## Design Direction for Future Development + +When extending TinyExp, prefer: + +- small `XXXCfg` components over large abstractions +- explicit calls over automatic behavior +- recipe-style examples over framework-owned trainers +- local clarity over generic indirection +- composable building blocks over lifecycle machinery +- removing weak abstractions rather than keeping them "just in case" +- one clear representation of a concept over several overlapping ones + +In short: + +TinyExp should help users write less experiment plumbing, not less experiment logic. 
diff --git a/mkdocs.yml b/mkdocs.yml index 82c6533..b0ed407 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,6 +9,10 @@ copyright: Maintained by zengarden. nav: - Home: index.md + - Philosophy: philosophy.md + - Phase 1 Plan: phase1-minimal-helpers.md + - Phase 1 File Plan: phase1-file-by-file-plan.md + - Phase 1 API Draft: phase1-api-draft.md - Modules: modules.md plugins: - search diff --git a/tests/examples/test_mnist_exp_run.py b/tests/examples/test_mnist_exp_run.py new file mode 100644 index 0000000..29a6ab2 --- /dev/null +++ b/tests/examples/test_mnist_exp_run.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from tinyexp.examples.mnist_exp import Exp + + +def test_mnist_run_val_mode_requires_resume_from(tmp_path: Path, monkeypatch) -> None: + exp = Exp(output_root=str(tmp_path), exp_name="mnist_val", mode="val", resume_from="") + + dummy_accelerator = SimpleNamespace(rank=0, device="cpu", is_main_process=True) + dummy_logger = SimpleNamespace(info=lambda *args, **kwargs: None) + + monkeypatch.setattr(exp.accelerator_cfg, "build_accelerator", lambda: dummy_accelerator) + monkeypatch.setattr(exp.logger_cfg, "build_logger", lambda **kwargs: dummy_logger) + + with pytest.raises(ValueError, match="resume_from"): + exp.run() + + +def test_mnist_run_val_mode_uses_checkpoint(tmp_path: Path, monkeypatch) -> None: + exp_for_ckpt = Exp(output_root=str(tmp_path), exp_name="mnist_val") + checkpoint_path = exp_for_ckpt.checkpoint_cfg.save_checkpoint( + run_dir=str(tmp_path / "mnist_val"), + name="demo.ckpt", + model=exp_for_ckpt.module_cfg.build_module(), + exp_name=exp_for_ckpt.exp_name, + exp_class=exp_for_ckpt.exp_class, + ) + + exp = Exp(output_root=str(tmp_path), exp_name="mnist_val", mode="val", resume_from=checkpoint_path) + + dummy_accelerator = SimpleNamespace(rank=0, device="cpu", is_main_process=True) + dummy_logger = SimpleNamespace(info=lambda *args, **kwargs: None) + + 
monkeypatch.setattr(exp.accelerator_cfg, "build_accelerator", lambda: dummy_accelerator) + monkeypatch.setattr(exp.logger_cfg, "build_logger", lambda **kwargs: dummy_logger) + + called: dict[str, object] = {} + + def fake_evaluate(*, accelerator, logger, module_or_module_path, val_dataloader=None): + called["accelerator"] = accelerator + called["logger"] = logger + called["module_or_module_path"] = module_or_module_path + called["val_dataloader"] = val_dataloader + return 0.5 + + monkeypatch.setattr(exp, "_evaluate", fake_evaluate) + + exp.run() + + assert called["accelerator"] is dummy_accelerator + assert called["logger"] is dummy_logger + assert called["module_or_module_path"] == checkpoint_path diff --git a/tests/examples/test_mnist_exp_unit.py b/tests/examples/test_mnist_exp_unit.py new file mode 100644 index 0000000..be6b826 --- /dev/null +++ b/tests/examples/test_mnist_exp_unit.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from types import SimpleNamespace + +import torch + +from tinyexp.examples.mnist_exp import Exp + + +def test_mnist_evaluate_loads_model_state_from_checkpoint(tmp_path) -> None: + exp = Exp(output_root=str(tmp_path), exp_name="mnist_test") + checkpoint_path = exp.checkpoint_cfg.save_checkpoint( + run_dir=str(tmp_path), + name="demo.ckpt", + model=exp.module_cfg.build_module(), + exp_name=exp.exp_name, + exp_class=exp.exp_class, + ) + + class DummyAccelerator: + device = "cpu" + rank = 0 + world_size = 1 + is_main_process = True + + def prepare(self, module): + return module + + def reduce_sum(self, tensor): + return tensor + + def wait_for_everyone(self) -> None: + return None + + dummy_logger = SimpleNamespace(info=lambda *args, **kwargs: None) + + class DummyDataLoader(list): + def __init__(self): + super().__init__([(torch.zeros(1, 1, 28, 28), torch.zeros(1, dtype=torch.long))]) + self.dataset = [0] + + val_dataloader = DummyDataLoader() + metric = exp._evaluate( + accelerator=DummyAccelerator(), + logger=dummy_logger, + 
module_or_module_path=checkpoint_path, + val_dataloader=val_dataloader, + ) + + assert isinstance(metric, float) diff --git a/tests/examples/test_resnet_exp_run.py b/tests/examples/test_resnet_exp_run.py new file mode 100644 index 0000000..e2bb573 --- /dev/null +++ b/tests/examples/test_resnet_exp_run.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace + +import pytest +import torch +import torch.nn as nn + +from tinyexp.examples.resnet_exp import ResNetExp + + +def test_resnet_run_val_mode_requires_resume_from(tmp_path: Path, monkeypatch) -> None: + exp = ResNetExp(output_root=str(tmp_path), exp_name="resnet_val", mode="val", resume_from="") + + dummy_accelerator = SimpleNamespace(rank=0, device="cpu", is_main_process=True) + dummy_logger = SimpleNamespace(info=lambda *args, **kwargs: None) + + monkeypatch.setattr(exp.accelerator_cfg, "build_accelerator", lambda: dummy_accelerator) + monkeypatch.setattr(exp.logger_cfg, "build_logger", lambda **kwargs: dummy_logger) + + with pytest.raises(ValueError, match="resume_from"): + exp.run() + + +def test_resnet_run_val_mode_uses_checkpoint(tmp_path: Path, monkeypatch) -> None: + exp_for_ckpt = ResNetExp(output_root=str(tmp_path), exp_name="resnet_val") + checkpoint_path = exp_for_ckpt.checkpoint_cfg.save_checkpoint( + run_dir=str(tmp_path / "resnet_val"), + name="demo.ckpt", + model=nn.Linear(2, 2), + exp_name=exp_for_ckpt.exp_name, + exp_class=exp_for_ckpt.exp_class, + ) + + exp = ResNetExp(output_root=str(tmp_path), exp_name="resnet_val", mode="val", resume_from=checkpoint_path) + + dummy_accelerator = SimpleNamespace(rank=0, device="cpu", is_main_process=True) + dummy_logger = SimpleNamespace(info=lambda *args, **kwargs: None) + + monkeypatch.setattr(exp.accelerator_cfg, "build_accelerator", lambda: dummy_accelerator) + monkeypatch.setattr(exp.logger_cfg, "build_logger", lambda **kwargs: dummy_logger) + + called: dict[str, object] = {} + + def 
fake_evaluate(*, accelerator, logger, module_or_module_path, val_dataloader=None): + called["accelerator"] = accelerator + called["logger"] = logger + called["module_or_module_path"] = module_or_module_path + called["val_dataloader"] = val_dataloader + return 0.5 + + monkeypatch.setattr(exp, "_evaluate", fake_evaluate) + + exp.run() + + assert called["accelerator"] is dummy_accelerator + assert called["logger"] is dummy_logger + assert called["module_or_module_path"] == checkpoint_path + assert called["val_dataloader"] is None + + +def test_resnet_train_saves_last_and_best_checkpoints(tmp_path: Path, monkeypatch) -> None: + exp = ResNetExp(output_root=str(tmp_path), exp_name="resnet_train") + + class DummyAccelerator: + rank = 0 + device = "cpu" + is_main_process = True + world_size = 1 + + def prepare(self, module, optimizer): + return module, optimizer + + def unwrap_model(self, module): + return module + + def backward(self, loss): + loss.backward() + + train_batch = [(torch.randn(2, 2), torch.tensor([0, 1]))] + val_batch = [(torch.randn(2, 2), torch.tensor([0, 1]))] + + monkeypatch.setattr(exp.dataloader_cfg, "build_train_dataloader", lambda accelerator, redis_cache_cfg: train_batch) + monkeypatch.setattr(exp.dataloader_cfg, "build_val_dataloader", lambda accelerator: val_batch) + monkeypatch.setattr(exp.module_cfg, "build_module", lambda: nn.Linear(2, 2)) + monkeypatch.setattr( + exp.optimizer_cfg, + "build_optimizer", + lambda module, dataloader, accelerator: torch.optim.SGD(module.parameters(), lr=0.1), + ) + + saved: list[dict[str, object]] = [] + + def fake_save_checkpoint(**kwargs): + saved.append(kwargs) + if len(saved) >= 2: + raise StopIteration + return str(tmp_path / "resnet_train" / "last.ckpt") + + monkeypatch.setattr(exp.checkpoint_cfg, "save_checkpoint", fake_save_checkpoint) + monkeypatch.setattr(exp, "_evaluate", lambda **kwargs: 0.5) + + with pytest.raises(StopIteration): + exp._train( + accelerator=DummyAccelerator(), + 
logger=SimpleNamespace(info=lambda *args, **kwargs: None), + cfg_dict={}, + run_dir=str(tmp_path / "resnet_train"), + ) + + assert saved[0]["run_dir"] == str(tmp_path / "resnet_train") + assert saved[0]["name"] == exp.checkpoint_cfg.last_ckpt_name + assert saved[0]["epoch"] == 0 + assert saved[0]["global_step"] == 1 + assert saved[0]["best_metric"] is None + assert saved[1]["name"] == exp.checkpoint_cfg.best_ckpt_name + assert saved[1]["best_metric"] == 0.5 + + +def test_resnet_train_resume_loads_checkpoint_state(tmp_path: Path, monkeypatch) -> None: + exp = ResNetExp(output_root=str(tmp_path), exp_name="resnet_train", resume_from="resume.ckpt") + + class DummyAccelerator: + rank = 0 + device = "cpu" + is_main_process = True + world_size = 1 + + def prepare(self, module, optimizer): + return module, optimizer + + def unwrap_model(self, module): + return module + + def backward(self, loss): + loss.backward() + + train_batch = [(torch.randn(2, 2), torch.tensor([0, 1]))] + val_batch = [(torch.randn(2, 2), torch.tensor([0, 1]))] + + monkeypatch.setattr(exp.dataloader_cfg, "build_train_dataloader", lambda accelerator, redis_cache_cfg: train_batch) + monkeypatch.setattr(exp.dataloader_cfg, "build_val_dataloader", lambda accelerator: val_batch) + monkeypatch.setattr(exp.module_cfg, "build_module", lambda: nn.Linear(2, 2)) + monkeypatch.setattr( + exp.optimizer_cfg, + "build_optimizer", + lambda module, dataloader, accelerator: torch.optim.SGD(module.parameters(), lr=0.1), + ) + + load_calls: list[dict[str, object]] = [] + saved: list[dict[str, object]] = [] + + def fake_load_checkpoint(path, **kwargs): + load_calls.append({"path": path, **kwargs}) + return {"epoch": 4, "global_step": 17, "best_metric": 0.7} + + def fake_save_checkpoint(**kwargs): + saved.append(kwargs) + raise StopIteration + + monkeypatch.setattr(exp.checkpoint_cfg, "load_checkpoint", fake_load_checkpoint) + monkeypatch.setattr(exp.checkpoint_cfg, "save_checkpoint", fake_save_checkpoint) + 
monkeypatch.setattr(exp, "_evaluate", lambda **kwargs: 0.5) + + with pytest.raises(StopIteration): + exp._train( + accelerator=DummyAccelerator(), + logger=SimpleNamespace(info=lambda *args, **kwargs: None), + cfg_dict={}, + run_dir=str(tmp_path / "resnet_train"), + ) + + assert load_calls[0]["path"] == "resume.ckpt" + assert load_calls[0]["map_location"] == "cpu" + assert saved[0]["epoch"] == 5 + assert saved[0]["global_step"] == 18 + assert saved[0]["best_metric"] == 0.7 diff --git a/tests/test_tinyexp_artifacts.py b/tests/test_tinyexp_artifacts.py new file mode 100644 index 0000000..e1d74a2 --- /dev/null +++ b/tests/test_tinyexp_artifacts.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest +import torch + +from tinyexp import CheckpointCfg, TinyExp +from tinyexp.exceptions import UnsupportedCheckpointFormatError + + +def test_get_run_dir(tmp_path: Path) -> None: + exp = TinyExp(output_root=str(tmp_path), exp_name="demo_exp") + + expected = tmp_path / "demo_exp" + assert exp.get_run_dir() == str(expected) + + +def test_logger_cfg_creates_run_dir(tmp_path: Path) -> None: + exp = TinyExp(output_root=str(tmp_path), exp_name="demo_exp") + run_dir = Path(exp.get_run_dir()) + + exp.logger_cfg.build_logger(save_dir=str(run_dir), distributed_rank=0) + + assert run_dir.is_dir() + assert (run_dir / "log.txt").is_file() + + +def test_checkpoint_cfg_save_and_load_roundtrip(tmp_path: Path) -> None: + model = torch.nn.Linear(2, 1) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + + with torch.no_grad(): + model.weight.fill_(1.5) + model.bias.fill_(0.5) + + checkpoint_cfg = CheckpointCfg() + checkpoint_path = checkpoint_cfg.save_checkpoint( + run_dir=str(tmp_path), + name=checkpoint_cfg.last_ckpt_name, + model=model, + optimizer=optimizer, + scheduler=scheduler, + epoch=3, + global_step=12, + best_metric=0.9, + exp_name="demo_exp", + 
exp_class="tests.demo.Exp", + extra_state={"custom_value": 7}, + ) + + reloaded_model = torch.nn.Linear(2, 1) + reloaded_optimizer = torch.optim.SGD(reloaded_model.parameters(), lr=0.1) + reloaded_scheduler = torch.optim.lr_scheduler.StepLR(reloaded_optimizer, step_size=1) + + checkpoint = checkpoint_cfg.load_checkpoint( + checkpoint_path, + model=reloaded_model, + optimizer=reloaded_optimizer, + scheduler=reloaded_scheduler, + ) + + assert Path(checkpoint_path).is_file() + assert checkpoint["epoch"] == 3 + assert checkpoint["global_step"] == 12 + assert checkpoint["best_metric"] == 0.9 + assert checkpoint["extra_state"]["custom_value"] == 7 + assert checkpoint["meta"]["exp_name"] == "demo_exp" + assert checkpoint["meta"]["exp_class"] == "tests.demo.Exp" + assert "saved_at" in checkpoint["meta"] + + for original_param, reloaded_param in zip(model.parameters(), reloaded_model.parameters()): + assert torch.equal(original_param, reloaded_param) + + +def test_checkpoint_cfg_extra_state_does_not_override_reserved_keys(tmp_path: Path) -> None: + checkpoint_cfg = CheckpointCfg() + + checkpoint_path = checkpoint_cfg.save_checkpoint( + run_dir=str(tmp_path), + name=checkpoint_cfg.last_ckpt_name, + epoch=3, + extra_state={"epoch": 99, "meta": {"exp_name": "bad"}}, + ) + + checkpoint = checkpoint_cfg.load_checkpoint(checkpoint_path) + + assert checkpoint["epoch"] == 3 + assert checkpoint["extra_state"]["epoch"] == 99 + assert checkpoint["meta"]["exp_name"] == "" + + +def test_checkpoint_cfg_rejects_unsupported_model_only_format(tmp_path: Path) -> None: + checkpoint_path = tmp_path / "model_only.ckpt" + torch.save({"state_dict": {"weight": torch.tensor([1.0])}}, checkpoint_path) + + checkpoint_cfg = CheckpointCfg() + + with pytest.raises(UnsupportedCheckpointFormatError, match="not a supported tinyexp checkpoint format"): + checkpoint_cfg.load_checkpoint(str(checkpoint_path)) + + +def test_checkpoint_cfg_rejects_non_dict_payload(tmp_path: Path) -> None: + checkpoint_path = 
tmp_path / "not_a_dict.ckpt" + torch.save([1, 2, 3], checkpoint_path) + + checkpoint_cfg = CheckpointCfg() + + with pytest.raises(TypeError, match="must be a dict, got list"): + checkpoint_cfg.load_checkpoint(str(checkpoint_path)) + + +def test_checkpoint_cfg_requires_model_state_when_model_is_provided(tmp_path: Path) -> None: + checkpoint_path = tmp_path / "missing_model_state.ckpt" + torch.save({"meta": {}, "epoch": 1}, checkpoint_path) + + checkpoint_cfg = CheckpointCfg() + model = torch.nn.Linear(2, 1) + + with pytest.raises(KeyError, match="model_state_dict"): + checkpoint_cfg.load_checkpoint(str(checkpoint_path), model=model) + + +def test_checkpoint_cfg_requires_optimizer_state_when_optimizer_is_provided(tmp_path: Path) -> None: + checkpoint_path = tmp_path / "missing_optimizer_state.ckpt" + torch.save({"meta": {}, "model_state_dict": {}}, checkpoint_path) + + checkpoint_cfg = CheckpointCfg() + model = torch.nn.Linear(2, 1) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1) + + with pytest.raises(KeyError, match="optimizer_state_dict"): + checkpoint_cfg.load_checkpoint(str(checkpoint_path), optimizer=optimizer) + + +def test_checkpoint_cfg_requires_scheduler_state_when_scheduler_is_provided(tmp_path: Path) -> None: + checkpoint_path = tmp_path / "missing_scheduler_state.ckpt" + torch.save({"meta": {}, "model_state_dict": {}, "optimizer_state_dict": {}}, checkpoint_path) + + checkpoint_cfg = CheckpointCfg() + model = torch.nn.Linear(2, 1) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + + with pytest.raises(KeyError, match="scheduler_state_dict"): + checkpoint_cfg.load_checkpoint(str(checkpoint_path), scheduler=scheduler) diff --git a/tinyexp/__init__.py b/tinyexp/__init__.py index d656884..ecc7c2b 100644 --- a/tinyexp/__init__.py +++ b/tinyexp/__init__.py @@ -5,18 +5,21 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional +from 
datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional +import torch from hydra.conf import HydraConf, RunDir from hydra.core.config_store import ConfigStore from omegaconf import DictConfig from omegaconf.listconfig import ListConfig -from .exceptions import UnknownConfigurationKeyError +from .exceptions import UnknownConfigurationKeyError, UnsupportedCheckpointFormatError from .utils.log_utils import tiny_logger_setup from .utils.ray_utils import simple_launch_exp -__all__ = ["ConfigStore", "RedisCfgMixin", "TinyExp", "simple_launch_exp"] +__all__ = ["CheckpointCfg", "ConfigStore", "RedisCfgMixin", "TinyExp", "simple_launch_exp"] @dataclass @@ -46,6 +49,99 @@ def _default_exp_name() -> str: return "exp" +def _is_main_process() -> bool: + return os.getenv("RANK", "0") == "0" + + +@dataclass +class CheckpointCfg: + last_ckpt_name: str = "last.ckpt" + best_ckpt_name: str = "best.ckpt" + + def save_checkpoint( + self, + *, + run_dir: str, + name: str, + model=None, + optimizer=None, + scheduler=None, + epoch: Optional[int] = None, + global_step: Optional[int] = None, + best_metric: Optional[float] = None, + exp_name: str = "", + exp_class: str = "", + extra_state: Optional[dict[str, Any]] = None, + ) -> str: + save_path = Path(run_dir) / name + save_path.parent.mkdir(parents=True, exist_ok=True) + + checkpoint: dict[str, Any] = { + "epoch": epoch, + "global_step": global_step, + "best_metric": best_metric, + "meta": { + "exp_name": exp_name, + "exp_class": exp_class, + "saved_at": datetime.now(timezone.utc).isoformat(), + }, + } + if model is not None: + checkpoint["model_state_dict"] = model.state_dict() + if optimizer is not None: + checkpoint["optimizer_state_dict"] = optimizer.state_dict() + if scheduler is not None: + checkpoint["scheduler_state_dict"] = scheduler.state_dict() + if extra_state is not None: + checkpoint["extra_state"] = extra_state + + torch.save(checkpoint, save_path) + return str(save_path) + + def 
_validate_checkpoint_payload(self, path: str, checkpoint: Any) -> dict[str, Any]: + if not isinstance(checkpoint, dict): + raise TypeError(f"Checkpoint at {path} must be a dict, got {type(checkpoint).__name__}.") # noqa: TRY003 + + if ( + not any(key in checkpoint for key in ("epoch", "global_step", "best_metric", "meta", "extra_state")) + and "model_state_dict" not in checkpoint + ): + raise UnsupportedCheckpointFormatError(path) + + return checkpoint + + def _load_required_state( + self, checkpoint: dict[str, Any], *, model=None, optimizer=None, scheduler=None, strict: bool = True + ) -> None: + if model is not None: + if "model_state_dict" not in checkpoint: + raise KeyError("model_state_dict") + model.load_state_dict(checkpoint["model_state_dict"], strict=strict) + if optimizer is not None: + if "optimizer_state_dict" not in checkpoint: + raise KeyError("optimizer_state_dict") + optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + if scheduler is not None: + if "scheduler_state_dict" not in checkpoint: + raise KeyError("scheduler_state_dict") + scheduler.load_state_dict(checkpoint["scheduler_state_dict"]) + + def load_checkpoint( + self, + path: str, + *, + model=None, + optimizer=None, + scheduler=None, + strict: bool = True, + map_location=None, + ) -> dict[str, Any]: + checkpoint = self._validate_checkpoint_payload(path, torch.load(path, map_location=map_location)) + self._load_required_state(checkpoint, model=model, optimizer=optimizer, scheduler=scheduler, strict=strict) + + return checkpoint + + @dataclass class TinyExp: """ @@ -72,6 +168,8 @@ class TinyExp: # log directory output_root: str = "./output" + mode: str = "train" + resume_from: str = "" # overridden configurations, only for internal use overrided_cfg: dict = field(default_factory=dict) @@ -97,11 +195,16 @@ def build_wandb(self, accelerator=None, **kwargs): @dataclass class LoggerCfg: def build_logger(self, save_dir: str, distributed_rank: int = 0, filename: str = "log.txt", mode: 
str = "a"): + Path(save_dir).mkdir(parents=True, exist_ok=True) logger = tiny_logger_setup(save_dir, distributed_rank, filename, mode) logger.info(f"==> log file: {os.path.join(save_dir, filename)}") return logger logger_cfg: LoggerCfg = field(default_factory=LoggerCfg) + checkpoint_cfg: CheckpointCfg = field(default_factory=CheckpointCfg) + + def get_run_dir(self) -> str: + return os.path.join(self.output_root, self.exp_name) def set_cfg(self, cfg_hydra, cfg_object=None): if cfg_object is None: diff --git a/tinyexp/examples/mnist_exp.py b/tinyexp/examples/mnist_exp.py index 2a14de7..ccdfdd1 100644 --- a/tinyexp/examples/mnist_exp.py +++ b/tinyexp/examples/mnist_exp.py @@ -1,5 +1,4 @@ import datetime -import os from dataclasses import dataclass, field import torch @@ -142,24 +141,30 @@ def build_lr_scheduler(self, optimizer): # ------------------------------ bellowing is the execution part --------------------- # def run(self) -> None: accelerator = self.accelerator_cfg.build_accelerator() - logger = self.logger_cfg.build_logger( - save_dir=os.path.join(self.output_root, self.exp_name), - distributed_rank=accelerator.rank, - ) + run_dir = self.get_run_dir() + logger = self.logger_cfg.build_logger(save_dir=run_dir, distributed_rank=accelerator.rank) cfg_dict = OmegaConf.to_container(OmegaConf.structured(self), resolve=True) del cfg_dict["hydra"] cfg_msg = OmegaConf.to_yaml(cfg_dict).strip().replace("\n", "\n ") logger.info(f"-------- Configurations --------\n {cfg_msg}") if self.mode == "train": - self._train(accelerator=accelerator, logger=logger, cfg_dict=cfg_dict) + self._train(accelerator=accelerator, logger=logger, cfg_dict=cfg_dict, run_dir=run_dir) + elif self.mode == "val": + if not self.resume_from: + raise ValueError("resume_from is required when mode='val'") # noqa: TRY003 + self._evaluate(accelerator=accelerator, logger=logger, module_or_module_path=self.resume_from) else: raise NotImplementedError(f"Mode {self.mode} is not implemented") - def 
_evaluate(self, accelerator, logger, module_or_module_path, val_dataloader=None) -> None: + def _evaluate(self, accelerator, logger, module_or_module_path, val_dataloader=None) -> float: if isinstance(module_or_module_path, str): module = Net() - module.load_state_dict(torch.load(module_or_module_path)) + self.checkpoint_cfg.load_checkpoint( + module_or_module_path, + model=module, + map_location=accelerator.device, + ) module = accelerator.prepare(module) else: module = module_or_module_path @@ -187,7 +192,9 @@ def _evaluate(self, accelerator, logger, module_or_module_path, val_dataloader=N if self.wandb_cfg.enable_wandb and accelerator.is_main_process: wandb.log({"val_metric": eval_metric}) - def _train(self, accelerator, logger, cfg_dict) -> None: + return eval_metric + + def _train(self, accelerator, logger, cfg_dict, run_dir: str) -> None: train_dataloader = self.dataloader_cfg.build_train_dataloader(accelerator) val_dataloader = self.dataloader_cfg.build_val_dataloader(accelerator) ori_module = self.module_cfg.build_module() @@ -195,6 +202,20 @@ def _train(self, accelerator, logger, cfg_dict) -> None: lr_scheduler = self.lr_scheduler_cfg.build_lr_scheduler(ori_optimizer) module, optimizer = accelerator.prepare(ori_module, ori_optimizer) + start_epoch = 0 + global_step = 0 + best_metric = None + if self.resume_from: + checkpoint = self.checkpoint_cfg.load_checkpoint( + self.resume_from, + model=accelerator.unwrap_model(module), + optimizer=optimizer, + scheduler=lr_scheduler, + map_location=accelerator.device, + ) + start_epoch = int(checkpoint.get("epoch", -1)) + 1 + global_step = int(checkpoint.get("global_step", 0)) + best_metric = checkpoint.get("best_metric") train_iter = iter(train_dataloader) if self.wandb_cfg.enable_wandb and accelerator.rank == 0: @@ -204,7 +225,7 @@ def _train(self, accelerator, logger, cfg_dict) -> None: config=cfg_dict, ) - for epoch in range(3): + for epoch in range(start_epoch, 3): module.train() for step in 
range(len(train_dataloader)): @@ -221,6 +242,7 @@ def _train(self, accelerator, logger, cfg_dict) -> None: optimizer.zero_grad() accelerator.backward(loss) optimizer.step() + global_step += 1 if (step + 1) % 20 == 0: logger.info(f"epoch {epoch} loss: {loss.item(): .4f} lr: {optimizer.param_groups[0]['lr']: .4f}") if self.wandb_cfg.enable_wandb and accelerator.rank == 0: @@ -231,9 +253,36 @@ def _train(self, accelerator, logger, cfg_dict) -> None: "lr": optimizer.param_groups[0]["lr"], } ) - self._evaluate( + eval_metric = self._evaluate( accelerator=accelerator, logger=logger, module_or_module_path=module, val_dataloader=val_dataloader ) + if accelerator.is_main_process: + self.checkpoint_cfg.save_checkpoint( + run_dir=run_dir, + name=self.checkpoint_cfg.last_ckpt_name, + model=accelerator.unwrap_model(module), + optimizer=optimizer, + scheduler=lr_scheduler, + epoch=epoch, + global_step=global_step, + best_metric=best_metric, + exp_name=self.exp_name, + exp_class=self.exp_class, + ) + if best_metric is None or eval_metric > best_metric: + best_metric = eval_metric + self.checkpoint_cfg.save_checkpoint( + run_dir=run_dir, + name=self.checkpoint_cfg.best_ckpt_name, + model=accelerator.unwrap_model(module), + optimizer=optimizer, + scheduler=lr_scheduler, + epoch=epoch, + global_step=global_step, + best_metric=best_metric, + exp_name=self.exp_name, + exp_class=self.exp_class, + ) lr_scheduler.step() diff --git a/tinyexp/examples/resnet_exp.py b/tinyexp/examples/resnet_exp.py index 5447497..2239f70 100644 --- a/tinyexp/examples/resnet_exp.py +++ b/tinyexp/examples/resnet_exp.py @@ -327,23 +327,30 @@ def build_val_dataloader(self, accelerator): def run(self) -> None: accelerator = self.accelerator_cfg.build_accelerator() - logger = self.logger_cfg.build_logger( - save_dir=os.path.join(self.output_root, self.exp_name), distributed_rank=accelerator.rank - ) + run_dir = self.get_run_dir() + logger = self.logger_cfg.build_logger(save_dir=run_dir, 
distributed_rank=accelerator.rank) cfg_dict = OmegaConf.to_container(OmegaConf.structured(self), resolve=True) del cfg_dict["hydra"] cfg_msg = OmegaConf.to_yaml(cfg_dict).strip().replace("\n", "\n ") logger.info(f"-------- Configurations --------\n {cfg_msg}") if self.mode == "train": - self._train(accelerator=accelerator, logger=logger, cfg_dict=cfg_dict) + self._train(accelerator=accelerator, logger=logger, cfg_dict=cfg_dict, run_dir=run_dir) + elif self.mode == "val": + if not self.resume_from: + raise ValueError("resume_from is required when mode='val'") # noqa: TRY003 + self._evaluate(accelerator=accelerator, logger=logger, module_or_module_path=self.resume_from) else: raise NotImplementedError(f"Mode {self.mode} is not implemented") def _evaluate(self, accelerator, logger, module_or_module_path, val_dataloader=None) -> None: if isinstance(module_or_module_path, str): module: nn.Module = self.module_cfg.build_module() - module.load_state_dict(torch.load(module_or_module_path)) + self.checkpoint_cfg.load_checkpoint( + module_or_module_path, + model=module, + map_location=accelerator.device, + ) module = accelerator.prepare_model(module) else: module = module_or_module_path @@ -373,13 +380,30 @@ def _evaluate(self, accelerator, logger, module_or_module_path, val_dataloader=N if self.wandb_cfg.enable_wandb and accelerator.is_main_process: wandb.log({"val_metric": eval_metric}) - def _train(self, accelerator, logger, cfg_dict) -> None: + return eval_metric + + def _train(self, accelerator, logger, cfg_dict, run_dir: str) -> None: train_dataloader = self.dataloader_cfg.build_train_dataloader(accelerator, self.redis_cache_cfg) val_dataloader = self.dataloader_cfg.build_val_dataloader(accelerator) ori_module = self.module_cfg.build_module() ori_optimizer = self.optimizer_cfg.build_optimizer(ori_module, train_dataloader, accelerator) module, optimizer = accelerator.prepare(ori_module, ori_optimizer) lr_scheduler = self.lr_scheduler_cfg.build_lr_scheduler(optimizer) + 
start_epoch = 0 + global_step = 0 + best_metric = None + + if self.resume_from: + checkpoint = self.checkpoint_cfg.load_checkpoint( + self.resume_from, + model=accelerator.unwrap_model(module), + optimizer=optimizer, + scheduler=lr_scheduler, + map_location=accelerator.device, + ) + start_epoch = int(checkpoint.get("epoch", -1)) + 1 + global_step = int(checkpoint.get("global_step", 0)) + best_metric = checkpoint.get("best_metric") if self.wandb_cfg.enable_wandb and accelerator.rank == 0: self.wandb_cfg.build_wandb( @@ -387,9 +411,8 @@ def _train(self, accelerator, logger, cfg_dict) -> None: ) train_iter = iter(train_dataloader) - global_step = 0 - for global_epoch in range(90): + for global_epoch in range(start_epoch, 90): module.train() epoch_start_time = time.time() @@ -425,9 +448,36 @@ def _train(self, accelerator, logger, cfg_dict) -> None: ) lr_scheduler.step() - self._evaluate( + eval_metric = self._evaluate( accelerator=accelerator, logger=logger, module_or_module_path=module, val_dataloader=val_dataloader ) + if accelerator.is_main_process: + self.checkpoint_cfg.save_checkpoint( + run_dir=run_dir, + name=self.checkpoint_cfg.last_ckpt_name, + model=accelerator.unwrap_model(module), + optimizer=optimizer, + scheduler=lr_scheduler, + epoch=global_epoch, + global_step=global_step, + best_metric=best_metric, + exp_name=self.exp_name, + exp_class=self.exp_class, + ) + if best_metric is None or eval_metric > best_metric: + best_metric = eval_metric + self.checkpoint_cfg.save_checkpoint( + run_dir=run_dir, + name=self.checkpoint_cfg.best_ckpt_name, + model=accelerator.unwrap_model(module), + optimizer=optimizer, + scheduler=lr_scheduler, + epoch=global_epoch, + global_step=global_step, + best_metric=best_metric, + exp_name=self.exp_name, + exp_class=self.exp_class, + ) if __name__ == "__main__": diff --git a/tinyexp/exceptions.py b/tinyexp/exceptions.py index 38bb8c7..b190481 100644 --- a/tinyexp/exceptions.py +++ b/tinyexp/exceptions.py @@ -44,3 +44,11 @@ def 
__init__(self, launcher: str, allowed: Sequence[str] = ("python", "torchrun" class CudaNotAvailableError(RuntimeError): def __init__(self) -> None: super().__init__("CUDA is required but not available.") + + +class UnsupportedCheckpointFormatError(ValueError): + def __init__(self, path: str) -> None: + self.path = path + super().__init__( + f"Checkpoint at {path} is not a supported tinyexp checkpoint format and does not contain model_state_dict." + )