From 5a72686cbcf2ad9fd4041e3011255a4e42d870f5 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Sat, 2 Aug 2025 22:47:01 -0400 Subject: [PATCH 1/4] Converted notes to marksman and anki --- .github/pull_request_template.md | 10 +- .github/workflows/ci.yaml | 4 +- .gitignore | 1 + .pre-commit-config.yaml | 4 +- Justfile | 3 + README.md | 42 +- continuing_education/math/README.md | 8 - continuing_education/math/basics/logarithm.md | 4 + .../math/calculus/gradient.md | 9 + .../math/calculus/henessian.md | 9 + .../math/calculus/laplacian.md | 9 + .../math/statistics/log_probability/README.md | 9 + .../log_probability}/__init__.py | 0 .../log_probability}/log_probability.ipynb | 0 .../log_probability}/log_probability.py | 0 .../actor_critic/README.md | 15 +- .../notes}/advantage.md | 10 +- .../notes/policy-gradient.md | 0 .../policy_gradient_methods/notes}/policy.md | 0 .../reinforce/README.md | 69 +++ .../value_based_methods/dqn/README.md | 64 +++ .../value_based_methods/duelingdqn/README.md | 13 + .../value_based_methods/notes/td.md | 1 + .../value_based_methods/notes/value.md | 1 + notes/logseq/config.edn | 421 ------------------ notes/logseq/config.end | 421 ------------------ notes/pages/card.md | 0 notes/pages/contents.md | 1 - notes/pages/dqn.md | 40 -- notes/pages/duelingdqn.md | 8 - notes/pages/gradient.md | 3 - notes/pages/henessian.md | 0 notes/pages/laplacian.md | 0 notes/pages/log-probability.md | 3 - notes/pages/policy-gradient.md | 3 - notes/pages/reinforce.md | 58 --- notes/pages/td.md | 0 notes/pages/value-based-methods.md | 3 - 38 files changed, 232 insertions(+), 1014 deletions(-) delete mode 100644 continuing_education/math/README.md create mode 100644 continuing_education/math/basics/logarithm.md create mode 100644 continuing_education/math/calculus/gradient.md create mode 100644 continuing_education/math/calculus/henessian.md create mode 100644 continuing_education/math/calculus/laplacian.md create mode 100644 
continuing_education/math/statistics/log_probability/README.md rename continuing_education/math/{ => statistics/log_probability}/__init__.py (100%) rename continuing_education/math/{ => statistics/log_probability}/log_probability.ipynb (100%) rename continuing_education/math/{ => statistics/log_probability}/log_probability.py (100%) rename notes/pages/actor-critic.md => continuing_education/policy_gradient_methods/actor_critic/README.md (57%) rename {notes/pages => continuing_education/policy_gradient_methods/notes}/advantage.md (67%) rename notes/logseq/custom.css => continuing_education/policy_gradient_methods/notes/policy-gradient.md (100%) rename {notes/pages => continuing_education/policy_gradient_methods/notes}/policy.md (100%) create mode 100644 continuing_education/policy_gradient_methods/reinforce/README.md create mode 100644 continuing_education/value_based_methods/dqn/README.md create mode 100644 continuing_education/value_based_methods/duelingdqn/README.md create mode 100644 continuing_education/value_based_methods/notes/td.md create mode 100644 continuing_education/value_based_methods/notes/value.md delete mode 100644 notes/logseq/config.edn delete mode 100644 notes/logseq/config.end delete mode 100644 notes/pages/card.md delete mode 100644 notes/pages/contents.md delete mode 100644 notes/pages/dqn.md delete mode 100644 notes/pages/duelingdqn.md delete mode 100644 notes/pages/gradient.md delete mode 100644 notes/pages/henessian.md delete mode 100644 notes/pages/laplacian.md delete mode 100644 notes/pages/log-probability.md delete mode 100644 notes/pages/policy-gradient.md delete mode 100644 notes/pages/reinforce.md delete mode 100644 notes/pages/td.md delete mode 100644 notes/pages/value-based-methods.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f39b4ca..c2b39a4 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -28,9 +28,9 @@ if __name__ == '__main__': foo() ``` -## 
Logseq +## Marksman -- [ ] Make logseq notes and flash cards. -- [ ] Use singular nouns for tags. -- [ ] Use `-` or `_` in filenames instead of spaces. -- [ ] Use aliases for spaces and plurals. +- [ ] Make marksman notes and anki flash cards. +- [ ] Use singular nouns for filenames. +- [ ] Use `-` in filenames instead of spaces. +- [ ] For any new notes, you need to go back in other notes and link to them using project search. diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 24e116c..6b456da 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,12 +10,12 @@ jobs: steps: - uses: actions/checkout@v4 with: - submodules: 'true' + submodules: "true" - name: Install Python uses: actions/setup-python@v4 with: - python-version: '3.11' + python-version: "3.11" - name: Install Just uses: extractions/setup-just@v2 diff --git a/.gitignore b/.gitignore index 4a24b9e..02db118 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +anki.apkg notes/logseq/bak notes/logseq/.recycle diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 74cc46e..a57b198 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,9 +40,9 @@ repos: rev: v1.6.1 hooks: - id: enforce-ascii - files: notes/pages/.*\.md + files: continuing_education/.*\.md - id: mdlinker - files: notes/pages/.*\.md + files: continuing_education/.*\.md args: - "--fix" - "--allow-dirty" diff --git a/Justfile b/Justfile index ef1210e..eb48847 100644 --- a/Justfile +++ b/Justfile @@ -5,3 +5,6 @@ add path: # Sync all notebooks sync: uv run jupytext --sync **/*.ipynb + +anki: + mdanki continuing_education anki.apkg diff --git a/README.md b/README.md index 782631d..d4508ee 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ If you then wrap cells (the code you run) in `if __name__ == "__main__"` you als ## Logseq -I use [logseq](https://logseq.com/) to manage my [./notes](./notes) and 
[flashcards](https://hub.logseq.com/use-cases/1Sr4awszMQzD4GM5KvWim7/how-to-quickly-create-spaced-repetition-flashcards-in-logseq-using-notepad-and-excel/jxPxEdkM4BBhLfrz9PfjBr). This uses the [zettelkasten](https://zettelkasten.de/overview/) method to create a [knowledge graph](https://en.wikipedia.org/wiki/Knowledge_graph), which is perfect for studying. +I use [marksman](https://github.com/artempyanykh/marksman) to manage my markdown notes and link them to each other. I then use [mdanki](https://github.com/ashlinchak/mdanki) to convert them to flashcards. # Styleguide @@ -52,7 +52,7 @@ See [.github/pull_request_template.md](.github/pull_request_template.md) for the * Reinforcement Learning * Value Based Methods - I'm pretty much up to date with these methods, but might as well implement them. I may go into less explanation though. * πŸ“–πŸ““πŸ’» [$TD(\lambda)$](https://web.stanford.edu/class/cs234/notes/cs234-notes7.pdf) - * πŸ“–πŸ““πŸ’»β— [Deep Q Learning](https://arxiv.org/abs/1312.5602) + * πŸ“–πŸ““πŸ’»β— [Deep Q Learning](https://arxiv.org/abs/1312.5602) * * πŸ“–β—[Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) * πŸ“–β—[Double Q Learning](https://arxiv.org/abs/1509.06461) @@ -85,21 +85,21 @@ See [.github/pull_request_template.md](.github/pull_request_template.md) for the * [ ] [Go-Explore](https://www.nature.com/articles/s41586-020-03157-9) * [ ] [NoisyNet](https://openreview.net/pdf?id=rywHCPkAW) * [ ] [DQN-PixelCNN](https://arxiv.org/abs/1606.01868) - * [ ] [#Exploration](http://papers.neurips.cc/paper/6868-exploration-a-study-of-count-based-exploration-for-deep-reinforcement-learning.pdf) - * [ ] [EX2](https://papers.nips.cc/paper/2017/file/1baff70e2669e8376347efd3a874a341-Paper.pdf) - * [ ] [ICM](https://arxiv.org/abs/1705.05363) - * [ ] [RND](https://arxiv.org/abs/1810.12894) - * [ ] [NGU](https://arxiv.org/abs/2002.06038) - * [ ] [Agent57](https://arxiv.org/abs/2003.13350) - * [ ] [VIME](https://arxiv.org/abs/1605.09674) - * [ ] 
[EMI](https://openreview.net/forum?id=H1exf64KwH) - * [ ] [DIYAN](https://arxiv.org/abs/1802.06070) - * [ ] [SAC](https://arxiv.org/abs/1801.01290) - * [ ] [BootstrappedDQN](https://arxiv.org/abs/1602.04621) - * [ ] [PSRL](https://arxiv.org/pdf/1306.0940.pdf) - * [ ] [HER](https://arxiv.org/pdf/1707.01495.pdf) - * [ ] [DQfD](https://arxiv.org/abs/1704.03732) - * [ ] [R2D3](https://arxiv.org/abs/1909.01387) + * [ ] [#Exploration](http://papers.neurips.cc/paper/6868-exploration-a-study-of-count-based-exploration-for-deep-reinforcement-learning.pdf) + * [ ] [EX2](https://papers.nips.cc/paper/2017/file/1baff70e2669e8376347efd3a874a341-Paper.pdf) + * [ ] [ICM](https://arxiv.org/abs/1705.05363) + * [ ] [RND](https://arxiv.org/abs/1810.12894) + * [ ] [NGU](https://arxiv.org/abs/2002.06038) + * [ ] [Agent57](https://arxiv.org/abs/2003.13350) + * [ ] [VIME](https://arxiv.org/abs/1605.09674) + * [ ] [EMI](https://openreview.net/forum?id=H1exf64KwH) + * [ ] [DIYAN](https://arxiv.org/abs/1802.06070) + * [ ] [SAC](https://arxiv.org/abs/1801.01290) + * [ ] [BootstrappedDQN](https://arxiv.org/abs/1602.04621) + * [ ] [PSRL](https://arxiv.org/pdf/1306.0940.pdf) + * [ ] [HER](https://arxiv.org/pdf/1707.01495.pdf) + * [ ] [DQfD](https://arxiv.org/abs/1704.03732) + * [ ] [R2D3](https://arxiv.org/abs/1909.01387) * Multi Agent RL * [ ] [Emergent Communication through Negotiation](https://arxiv.org/abs/1804.03980) * [ ] Warp Drive @@ -114,11 +114,11 @@ See [.github/pull_request_template.md](.github/pull_request_template.md) for the * Transformers * [ ] [Tokenization](https://huggingface.co/learn/nlp-course/en/chapter6/1?fw=pt) * [ ] [Word Embeddings](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html) - * πŸ“–β—[Transformers](https://arxiv.org/abs/1706.03762) + * πŸ“–β—[Transformers](https://arxiv.org/abs/1706.03762) * * - * πŸ“–β—[BERT](https://arxiv.org/abs/1810.04805) - * [ ]❗[Sentence-BERT](https://arxiv.org/pdf/1908.10084) + * 
πŸ“–β—[BERT](https://arxiv.org/abs/1810.04805) + * [ ]❗[Sentence-BERT](https://arxiv.org/pdf/1908.10084) * [ ] [Fine Tuning](https://huggingface.co/learn/nlp-course/en/chapter3/1?fw=pt) * [ ] [RLHF](https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo) * [ ] [Direct Preference Optimization](https://arxiv.org/pdf/2305.18290) @@ -141,7 +141,7 @@ See [.github/pull_request_template.md](.github/pull_request_template.md) for the * [Survey on Graph RAG](https://arxiv.org/abs/2408.08921) * [ ] Diffusion Models * -* [ ]❗Graph Neural Networks (GNN) +* [ ]❗Graph Neural Networks (GNN) * * Cognitive Science * [ ] [Hopfield Network](https://www.youtube.com/watch?v=1WPJdAW-sFo) diff --git a/continuing_education/math/README.md b/continuing_education/math/README.md deleted file mode 100644 index d6f1af1..0000000 --- a/continuing_education/math/README.md +++ /dev/null @@ -1,8 +0,0 @@ -- What are the [[equalities]] of [[logarithms]]? #card - - They turn multiplication into addition: $\log(a \cdot b) = \log(a) + \log(b)$ - - They turn division into subtraction: $\log(a / b) = \log(a) - \log(b)$ - - They turn exponentiation into multiplication: $\log(a^b) = b \cdot \log(a)$ -- Why would you use a negative [[log probability]] in a [[loss function]]? #card - - It's infinity at 0 and 0 at 1, which means at high confidence in something you get a low loss approaching 0, and at low confidence you get a high loss approaching infinity. It gives a strong [[gradient]] signal to the network to update its parameters. -- What is the equation for a sigmoid function? #card - - $f(x) = \frac{1}{1 + e^{-x}}$ diff --git a/continuing_education/math/basics/logarithm.md b/continuing_education/math/basics/logarithm.md new file mode 100644 index 0000000..2afbe56 --- /dev/null +++ b/continuing_education/math/basics/logarithm.md @@ -0,0 +1,4 @@ +## What are the identities of [[logarithm]]s? 
+ - They turn multiplication into addition: $\log(a \cdot b) = \log(a) + \log(b)$ + - They turn division into subtraction: $\log(a / b) = \log(a) - \log(b)$ + - They turn exponentiation into multiplication: $\log(a^b) = b \cdot \log(a)$ diff --git a/continuing_education/math/calculus/gradient.md b/continuing_education/math/calculus/gradient.md new file mode 100644 index 0000000..0725928 --- /dev/null +++ b/continuing_education/math/calculus/gradient.md @@ -0,0 +1,9 @@ +# Gradient + +## What is the [[gradient]]? + +A vector of first-order partial derivatives, one for each dimension of the input space. + +## What does the symbol $\nabla$ mean? + +The [[gradient]] operator. diff --git a/continuing_education/math/calculus/henessian.md b/continuing_education/math/calculus/henessian.md new file mode 100644 index 0000000..8903526 --- /dev/null +++ b/continuing_education/math/calculus/henessian.md @@ -0,0 +1,9 @@ +# Henessian + +## What is a [[Henessian]]? + +A second order [[gradient]] that produces a matrix of second derivatives. + +## What is the difference between a [[Henessian]] and a [[Laplacian]]? + +- Henessians produce a matrix of second derivatives, while Laplacians produce a scalar. Both are second order derivatives. diff --git a/continuing_education/math/calculus/laplacian.md b/continuing_education/math/calculus/laplacian.md new file mode 100644 index 0000000..95ac7ef --- /dev/null +++ b/continuing_education/math/calculus/laplacian.md @@ -0,0 +1,9 @@ +# Laplacian + +## What is the [[Laplacian]]? + +The divergence of the [[gradient]]: the sum of the unmixed second partial derivatives. Produces a scalar. + +## What does the symbol $\Delta$ mean? + +The [[Laplacian]] operator. 
diff --git a/continuing_education/math/statistics/log_probability/README.md b/continuing_education/math/statistics/log_probability/README.md new file mode 100644 index 0000000..be1b9cb --- /dev/null +++ b/continuing_education/math/statistics/log_probability/README.md @@ -0,0 +1,9 @@ +# Log Probability + +## Why would you use a negative [[log-probability]] in a loss function? + +It's infinity at 0 and 0 at 1, which means at high confidence in something you get a low loss approaching 0, and at low confidence you get a high loss approaching infinity. It gives a strong [[gradient]] signal to the network to update its parameters. + +## What is the equation for a sigmoid function? + +$f(x) = \frac{1}{1 + e^{-x}}$ diff --git a/continuing_education/math/__init__.py b/continuing_education/math/statistics/log_probability/__init__.py similarity index 100% rename from continuing_education/math/__init__.py rename to continuing_education/math/statistics/log_probability/__init__.py diff --git a/continuing_education/math/log_probability.ipynb b/continuing_education/math/statistics/log_probability/log_probability.ipynb similarity index 100% rename from continuing_education/math/log_probability.ipynb rename to continuing_education/math/statistics/log_probability/log_probability.ipynb diff --git a/continuing_education/math/log_probability.py b/continuing_education/math/statistics/log_probability/log_probability.py similarity index 100% rename from continuing_education/math/log_probability.py rename to continuing_education/math/statistics/log_probability/log_probability.py diff --git a/notes/pages/actor-critic.md b/continuing_education/policy_gradient_methods/actor_critic/README.md similarity index 57% rename from notes/pages/actor-critic.md rename to continuing_education/policy_gradient_methods/actor_critic/README.md index 0234214..5072daf 100644 --- a/notes/pages/actor-critic.md +++ b/continuing_education/policy_gradient_methods/actor_critic/README.md @@ -1,12 +1,11 @@ ---- -alias: 
actor critic, a2c, a3c ---- -- What is the [[value function]] used for in [[actor critic]] methods? #card - - The [[value function]] is used to estimate the *average* expected return from a given state. +# Actor Critic + +## What is the [[value]] function used for in [[actor critic]] methods? + - The [[value]] function is used to estimate the *average* expected return from a given state. - It could theoretically be a Q-function, but in practice, it is often a state-value function using [[TD]] error. -- What is the [[advantage]] function used for in [[actor critic]] methods? #card +## What is the [[advantage]] function used for in [[actor critic]] methods? - The difference between the expected reward from a state-action pair (Q) and the average expected reward from just the state (V). - $A(s, a) = Q(s, a) - V(s)$ - It is used to normalize the [[policy gradient]], as well as to push the [[policy gradient]] towards actions that are better than average and away from actions that are worse than average. -- What is the training loop for [[a2c]] [[actor critic]]? #card -- What is the difference between [[a2c]] and [[a3c]]? #card +## What is the training loop for a2c? #todo +## What is the difference between a2c and a3c? diff --git a/notes/pages/advantage.md b/continuing_education/policy_gradient_methods/notes/advantage.md similarity index 67% rename from notes/pages/advantage.md rename to continuing_education/policy_gradient_methods/notes/advantage.md index 30bd228..5147d0a 100644 --- a/notes/pages/advantage.md +++ b/continuing_education/policy_gradient_methods/notes/advantage.md @@ -1,13 +1,9 @@ ---- -alias: advantage function ---- - -- What shape is the [[advantage]] function? #card +## What shape is the [[advantage]] function? - $A(s,a) \in \mathbb{R}^{|A|}$, where $|A|$ is the number of actions. - It works in a fixed integer number of actions. - Same shape as the Q-value function. -- What is the intuition behind the [[advantage]] function? 
#card +## What is the intuition behind the [[advantage]] function? - The [[advantage]] function is a measure of how much better an action is compared to the average action in a given state. - Learning relative [[advantage]] is easier and has less variance than learning absolute values. [[Advantage]] is more relevant to decision making via argmax than absolute values. -- Define the [[advantage]] function in terms of the Q-value function and the value function. #card +## Define the [[advantage]] function in terms of the Q-value function and the value function. - $A(s,a) = Q(s,a) - V(s)$ diff --git a/notes/logseq/custom.css b/continuing_education/policy_gradient_methods/notes/policy-gradient.md similarity index 100% rename from notes/logseq/custom.css rename to continuing_education/policy_gradient_methods/notes/policy-gradient.md diff --git a/notes/pages/policy.md b/continuing_education/policy_gradient_methods/notes/policy.md similarity index 100% rename from notes/pages/policy.md rename to continuing_education/policy_gradient_methods/notes/policy.md diff --git a/continuing_education/policy_gradient_methods/reinforce/README.md b/continuing_education/policy_gradient_methods/reinforce/README.md new file mode 100644 index 0000000..add1a18 --- /dev/null +++ b/continuing_education/policy_gradient_methods/reinforce/README.md @@ -0,0 +1,69 @@ +# Reinforce + +## What is the [[REINFORCE]] scoring function? + +- $L(\theta) = \frac{1}{T} \sum_{t=0}^{T-1} G_t \log \pi_{\theta}(a_t | s_t)$ + - $L(\theta)$: The loss function + - $T$: The number of time steps in the episode + - $G_t$: Cumulative discounted future reward at time step $t$ + - $\pi_{\theta}(a_t | s_t)$: The probability of taking action $a_t$ in state $s_t$ under [[policy]] $\pi_{\theta}$ +- $\Delta \theta = \alpha r \frac{\partial \log \pi_{\theta}(s, a)}{\partial \theta}$ + - $\Delta \theta$: The update to the [[policy]] parameters. 
Here $\Delta$ denotes a change in $\theta$, not the [[laplacian]]. + - $\alpha$: The learning rate + - $r$: The reward + - $\partial{\log \pi_{\theta}(s, a)}$: The [[gradient]] of the [[log probability]] of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ + +## What is the [[policy gradient]] theorem? + +- For any differentiable [[policy]] and for any [[policy]] objective function, the [[policy gradient]] is: $\nabla_{\theta} J(\theta) = \mathbb{E}_{\pi_{\theta}}[\nabla_{\theta} \log \pi_{\theta}(a_t | s_t) R(\tau)]$ + - $J(\theta)$: The objective function to maximize + - $\pi_{\theta}(a_t | s_t)$: The probability of taking action $a_t$ in state $s_t$ under [[policy]] $\pi_{\theta}$ + - $R(\tau)$: The return of a trajectory $\tau$, which is often formulated as cumulative discounted future rewards. + - $J(\theta)$: The objective function + +## How does [[REINFORCE]] implement policy gradient? + +Using a Monte Carlo method over episode rollouts. + +## What is the training loop for [[REINFORCE]]? + +- Initialize the [[policy]] $\pi_{\theta}$ with random weights +- For each episode: + - Generate a trajectory $\tau$ by following the [[policy]] $\pi_{\theta}$ + - Compute the return $R(\tau)$ + - Compute the [[policy gradient]] $\nabla_{\theta} J(\theta)$ + - Update the [[policy]] parameters $\theta$ with the [[gradient]] + +## What is $\pi_{\theta}(a | s)$? + +- The function which is learned by the [[REINFORCE]] algorithm +- The probability of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ + +## Why do you need to normalize the rewards in [[REINFORCE]]? + +Since the rewards are arbitrary and directly part of the objective/loss function, they are normalized to make the optimization more stable. + +## What is a multinomial distribution? + +A probability distribution over a discrete number of possible outcomes, where each outcome has a probability associated with it. Like a dice roll, where each face has a probability of being rolled. 
+ +## What are some advantages of [[policy gradient]] methods over [[value]] based methods? + +- They can learn stochastic policies +- They can learn policies in high-dimensional or continuous action spaces +- They can have better convergence properties; they can be made to change smoothly over time with sampling rather than depending on an argmax operation. + +## What are some disadvantages of [[policy gradient]] methods vs [[value]] based methods? + +- They have high variance in the [[gradient]]s which can lead to + - Slow convergence + - Catastrophic forgetting +- They can be sensitive to the choice of step size +- They can be computationally expensive + +## What is the softmax equation with temperature? + +- $P(i) = \frac{e^{Z_i/T}}{\sum_{j} e^{Z_j/T}}$ +- $P(i)$: The probability of outcome $i$ +- $Z_i$: The logit of outcome $i$ +- $T$: The temperature parameter diff --git a/continuing_education/value_based_methods/dqn/README.md b/continuing_education/value_based_methods/dqn/README.md new file mode 100644 index 0000000..b9a3cfe --- /dev/null +++ b/continuing_education/value_based_methods/dqn/README.md @@ -0,0 +1,64 @@ +# DQN + +## What is the Bellman equation? +* The Bellman equation is a recursive relationship that defines the value of a state-action pair in terms of the immediate reward and the expected value of the next state. + +* $Q(s, a) = r + \lambda \cdot \max_{a'} Q(s', a') \cdot (1 - done(s, a))$ + * $Q(s, a)$: The Q-value of taking action $a$ in state $s$ + * $r$: The reward of taking action $a$ in state $s$ + * $\lambda$: The discount factor + * $\max_{a'} Q(s', a')$: The maximum Q-value of the next state $s'$ + * $done(s, a)$: Whether the episode is done after taking action $a$ in state $s$ + +## What does the Q value represent? + +The expected cumulative future reward of taking action $a$ in state $s$ following the optimal [[policy]] thereafter. + +## What is the objective function of DQN? 
+ +* $L(\theta) = \mathbb{E}[(Q_\theta(s, a) - (r + \lambda \cdot \max_{a'} Q_\theta(s', a')))^2]$ + * $L(\theta)$: The loss function + * $Q_\theta(s, a)$: The Q-value of taking action $a$ in state $s$ + * $Q_\theta(s', a')$: The Q-value of taking action $a'$ in next state $s'$ + * $r$: The reward of taking action $a$ in state $s$ + * $\lambda$: The discount factor + +## What are some differences in the `act` method between [[dqn]] and [[REINFORCE]]? + +* QLearning uses an argmax to select the best action whereas [[REINFORCE]] uses a softmax sample to select an action +* QLearning has an epsilon greedy [[policy]] whereas [[REINFORCE]] has a stochastic [[policy]] + +## Define an epsilon greedy [[policy]] + +* An epsilon greedy [[policy]] is a [[policy]] that selects the best action with probability $1 - \epsilon$ and a random action with probability $\epsilon$ + +## What are some differences in the `train` method between [[dqn]] and [[REINFORCE]]? + +* QLearning trains on each step whereas [[REINFORCE]] trains at the end of each episode +* [[REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards + +## What are some differences in the `collect_episodes` method between [[dqn]] and [[REINFORCE]]? + +* QLearning uses SARS whereas [[REINFORCE]] uses SAR +* [[REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards +* This is because the Bellman equation requires a mixture of one real reward and one predicted reward from the network to properly train + +## What are the differences between exploration rate and temperature in [[dqn]] and [[REINFORCE]]? 
+ +* Exploration rate is a probability of taking a random action +* Temperature is a parameter in the softmax function that controls the stochasticity of the [[policy]] +* Exploration either happens or does not happen and is not controlled by the neural network at all +* Temperature just controls the output of the network. If the network is very confident in one action, it will still take that action with high probability, but if the network is unsure, it will take a random action with some probability. Therefore you don't need to decay temperature, but you do need to decay exploration rate. + +## What is action replay memory? Why is it needed in [[dqn]] but not [[REINFORCE]]. + +* A buffer that stores experiences for training the network. +* [[dqn]] is an off-policy method, so it can learn from past experiences. [[REINFORCE]] is an on-policy method, so it can't learn from past experiences. The replay buffer is a feature of off-policy methods, not a hindrance. + +## What is on-policy learning? + +When the agent learns from the same [[policy]] that it uses to interact with the environment. + +## What is off-policy learning? + +Off-policy learning is when the agent can learn from a different [[policy]] than the one it uses to interact with the environment. This allows for more efficient learning because the agent can learn from past experiences. diff --git a/continuing_education/value_based_methods/duelingdqn/README.md b/continuing_education/value_based_methods/duelingdqn/README.md new file mode 100644 index 0000000..92978c2 --- /dev/null +++ b/continuing_education/value_based_methods/duelingdqn/README.md @@ -0,0 +1,13 @@ +# Dueling DQN + +## What is the dueling DQN architecture? + +The Dueling [[dqn]] architecture is a modification of the standard [[dqn]] architecture that separates the [[value]] function and [[advantage]] function into two separate streams. 
+ +## What is the mathematical identity linking the advantage function, the Q function, and the value function? + +Because the [[advantage]] function can be defined as $A(s,a) = Q(s,a) - V(s)$, we can re-arrange this to get the Q-value function as $Q(s,a) = A(s,a) + V(s)$. + +## What advantage does separating the advantage and value functions provide? + +This has the effect of constraining the bias of each network, which can help with convergence. diff --git a/continuing_education/value_based_methods/notes/td.md b/continuing_education/value_based_methods/notes/td.md new file mode 100644 index 0000000..9e0d19e --- /dev/null +++ b/continuing_education/value_based_methods/notes/td.md @@ -0,0 +1 @@ +# Temporal Difference diff --git a/continuing_education/value_based_methods/notes/value.md b/continuing_education/value_based_methods/notes/value.md new file mode 100644 index 0000000..4c11c52 --- /dev/null +++ b/continuing_education/value_based_methods/notes/value.md @@ -0,0 +1 @@ +# Value diff --git a/notes/logseq/config.edn b/notes/logseq/config.edn deleted file mode 100644 index 5ef6e65..0000000 --- a/notes/logseq/config.edn +++ /dev/null @@ -1,421 +0,0 @@ -{:meta/version 1 - - ;; Set the preferred format. - ;; Available options: - ;; - Markdown (default) - ;; - Org - ;; :preferred-format "Markdown" - - ;; Set the preferred workflow style. - ;; Available options: - ;; - :now for NOW/LATER style (default) - ;; - :todo for TODO/DOING style - :preferred-workflow :now - - ;; Exclude directories/files. - ;; Example usage: - ;; :hidden ["/archived" "/test.md" "../assets/archived"] - :hidden [] - - ;; Define the default journal page template. - ;; Enter the template name between the quotes. - :default-templates - {:journals ""} - - ;; Set a custom date format for the journal page title. 
- ;; Default value: "MMM do, yyyy" - ;; e.g., "Jan 19th, 2038" - ;; Example usage e.g., "Tue 19th, Jan 2038" - ;; :journal/page-title-format "EEE do, MMM yyyy" - - ;; Specify the journal filename format using a valid date format string. - ;; !Warning: - ;; This configuration is not retroactive and affects only new journals. - ;; To show old journal files in the app, manually rename the files in the - ;; journal directory to match the new format. - ;; Default value: "yyyy_MM_dd" - ;; :journal/file-name-format "yyyy_MM_dd" - - ;; Enable tooltip preview on hover. - ;; Default value: true - :ui/enable-tooltip? true - - ;; Display brackets [[]] around page references. - ;; Default value: true - ;; :ui/show-brackets? true - - ;; Display all lines of a block when referencing ((block)). - ;; Default value: false - :ui/show-full-blocks? false - - ;; Automatically expand block references when zooming in. - ;; Default value: true - :ui/auto-expand-block-refs? true - - ;; Enable Block timestamps. - ;; Default value: false - :feature/enable-block-timestamps? false - - ;; Disable accent marks when searching. - ;; After changing this setting, rebuild the search index by pressing (^C ^S). - ;; Default value: true - :feature/enable-search-remove-accents? true - - ;; Enable journals. - ;; Default value: true - ;; :feature/enable-journals? true - - ;; Enable flashcards. - ;; Default value: true - ;; :feature/enable-flashcards? true - - ;; Enable whiteboards. - ;; Default value: true - ;; :feature/enable-whiteboards? true - - ;; Disable the journal's built-in 'Scheduled tasks and deadlines' query. - ;; Default value: false - ;; :feature/disable-scheduled-and-deadline-query? false - - ;; Specify the number of days displayed in the future for - ;; the 'scheduled tasks and deadlines' query. 
- ;; Example usage: - ;; Display all scheduled and deadline blocks for the next 14 days: - ;; :scheduled/future-days 14 - ;; Default value: 7 - ;; :scheduled/future-days 7 - - ;; Specify the first day of the week. - ;; Available options: - ;; - integer from 0 to 6 (Monday to Sunday) - ;; Default value: 6 (Sunday) - :start-of-week 6 - - ;; Specify a custom CSS import. - ;; This option takes precedence over the local `logseq/custom.css` file. - ;; Example usage: - ;; :custom-css-url "@import url('https://cdn.jsdelivr.net/gh/dracula/logseq@master/custom.css');" - - ;; Specify a custom JS import. - ;; This option takes precedence over the local `logseq/custom.js` file. - ;; Example usage: - ;; :custom-js-url "https://cdn.logseq.com/custom.js" - - ;; Set a custom Arweave gateway - ;; Default gateway: https://arweave.net - ;; :arweave/gateway "https://arweave.net" - - ;; Set bullet indentation when exporting - ;; Available options: - ;; - `:eight-spaces` as eight spaces - ;; - `:four-spaces` as four spaces - ;; - `:two-spaces` as two spaces - ;; - `:tab` as a tab character (default) - ;; :export/bullet-indentation :tab - - ;; Publish all pages within the Graph - ;; Regardless of whether individual pages have been marked as public. - ;; Default value: false - ;; :publishing/all-pages-public? false - - ;; Define the default home page and sidebar status. - ;; If unspecified, the journal page will be loaded on startup and the right sidebar will stay hidden. - ;; The `:page` value represents the name of the page displayed at startup. - ;; Available options for `:sidebar` are: - ;; - "Contents" to display the Contents page in the right sidebar. - ;; - A specific page name to display in the right sidebar. - ;; - An array of multiple pages, e.g., ["Contents" "Page A" "Page B"]. - ;; If `:sidebar` remains unset, the right sidebar will stay hidden. - ;; Examples: - ;; 1. 
Set "Changelog" as the home page and display "Contents" in the right sidebar: - ;; :default-home {:page "Changelog", :sidebar "Contents"} - ;; 2. Set "Jun 3rd, 2021" as the home page without the right sidebar: - ;; :default-home {:page "Jun 3rd, 2021"} - ;; 3. Set "home" as the home page and display multiple pages in the right sidebar: - ;; :default-home {:page "home", :sidebar ["Page A" "Page B"]} - - ;; Set the default location for storing notes. - ;; Default value: "pages" - ;; :pages-directory "pages" - - ;; Set the default location for storing journals. - ;; Default value: "journals" - ;; :journals-directory "journals" - - ;; Set the default location for storing whiteboards. - ;; Default value: "whiteboards" - ;; :whiteboards-directory "whiteboards" - - ;; Enabling this option converts - ;; [[Grant Ideas]] to [[file:./grant_ideas.org][Grant Ideas]] for org-mode. - ;; For more information, visit https://github.com/logseq/logseq/issues/672 - ;; :org-mode/insert-file-link? false - - ;; Configure custom shortcuts. - ;; Syntax: - ;; 1. + indicates simultaneous key presses, e.g., `Ctrl+Shift+a`. - ;; 2. A space between keys represents key chords, e.g., `t s` means - ;; pressing `t` followed by `s`. - ;; 3. mod refers to `Ctrl` for Windows/Linux and `Command` for Mac. - ;; 4. Use false to disable a specific shortcut. - ;; 5. You can define multiple bindings for a single action, e.g., ["ctrl+j" "down"]. 
- ;; The full list of configurable shortcuts is available at: - ;; https://github.com/logseq/logseq/blob/master/src/main/frontend/modules/shortcut/config.cljs - ;; Example: - ;; :shortcuts - ;; {:editor/new-block "enter" - ;; :editor/new-line "shift+enter" - ;; :editor/insert-link "mod+shift+k" - ;; :editor/highlight false - ;; :ui/toggle-settings "t s" - ;; :editor/up ["ctrl+k" "up"] - ;; :editor/down ["ctrl+j" "down"] - ;; :editor/left ["ctrl+h" "left"] - ;; :editor/right ["ctrl+l" "right"]} - :shortcuts {} - - ;; Configure the behavior of pressing Enter in document mode. - ;; if set to true, pressing Enter will create a new block. - ;; Default value: false - :shortcut/doc-mode-enter-for-new-block? false - - ;; Block content larger than `block/content-max-length` will not be searchable - ;; or editable for performance. - ;; Default value: 10000 - :block/content-max-length 10000 - - ;; Display command documentation on hover. - ;; Default value: true - :ui/show-command-doc? true - - ;; Display empty bullet points. - ;; Default value: false - :ui/show-empty-bullets? false - - ;; Pre-defined :view function to use with advanced queries. - :query/views - {:pprint - (fn [r] [:pre.code (pprint r)])} - - ;; Advanced queries `:result-transform` function. - ;; Transform the query result before displaying it. - :query/result-transforms - {:sort-by-priority - (fn [result] (sort-by (fn [h] (get h :block/priority "Z")) result))} - - ;; The following queries will be displayed at the bottom of today's journal page. - ;; The "NOW" query returns tasks with "NOW" or "DOING" status. - ;; The "NEXT" query returns tasks with "NOW", "LATER", or "TODO" status. - :default-queries - {:journals - [{:title "🔨 NOW" - :query [:find (pull ?h [*]) - :in $ ?start ?today - :where - [?h :block/marker ?marker] - [(contains? #{"NOW" "DOING"} ?marker)] - [?h :block/page ?p] - [?p :block/journal? 
true] - [?p :block/journal-day ?d] - [(>= ?d ?start)] - [(<= ?d ?today)]] - :inputs [:14d :today] - :result-transform (fn [result] - (sort-by (fn [h] - (get h :block/priority "Z")) result)) - :group-by-page? false - :collapsed? false} - {:title "📅 NEXT" - :query [:find (pull ?h [*]) - :in $ ?start ?next - :where - [?h :block/marker ?marker] - [(contains? #{"NOW" "LATER" "TODO"} ?marker)] - [?h :block/page ?p] - [?p :block/journal? true] - [?p :block/journal-day ?d] - [(> ?d ?start)] - [(< ?d ?next)]] - :inputs [:today :7d-after] - :group-by-page? false - :collapsed? false}]} - - ;; Add custom commands to the command palette - ;; Example usage: - ;; :commands - ;; [ - ;; ["js" "Javascript"] - ;; ["md" "Markdown"] - ;; ] - :commands [] - - ;; Enable collapsing blocks with titles but no children. - ;; By default, only blocks with children can be collapsed. - ;; Setting `:outliner/block-title-collapse-enabled?` to true allows collapsing - ;; blocks with titles (multiple lines) and content. For example: - ;; - block title - ;; block content - ;; Default value: false - :outliner/block-title-collapse-enabled? false - - ;; Macros replace texts and will make you more productive. - ;; Example usage: - ;; Change the :macros value below to: - ;; {"poem" "Rose is $1, violet's $2. Life's ordered: Org assists you."} - ;; input "{{poem red,blue}}" - ;; becomes - ;; Rose is red, violet's blue. Life's ordered: Org assists you. - :macros {} - - ;; Configure the default expansion level for linked references. - ;; For example, consider the following block hierarchy: - ;; - a [[page]] (level 1) - ;; - b (level 2) - ;; - c (level 3) - ;; - d (level 4) - ;; - ;; With the default value of level 2, block b will be collapsed. - ;; If the level's value is set to 3, block c will be collapsed. - ;; Default value: 2 - :ref/default-open-blocks-level 2 - - ;; Configure the threshold for linked references before collapsing. 
- ;; Default value: 100 - :ref/linked-references-collapsed-threshold 50 - - ;; Graph view configuration. - ;; Example usage: - ;; :graph/settings - ;; {:orphan-pages? true ; Default value: true - ;; :builtin-pages? false ; Default value: false - ;; :excluded-pages? false ; Default value: false - ;; :journal? false} ; Default value: false - - ;; Graph view configuration. - ;; Example usage: - ;; :graph/forcesettings - ;; {:link-dist 180 ; Default value: 180 - ;; :charge-strength -600 ; Default value: -600 - ;; :charge-range 600} ; Default value: 600 - - ;; Favorites to list on the left sidebar - :favorites [] - - ;; Set flashcards interval. - ;; Expected value: - ;; - Float between 0 and 1 - ;; higher values result in faster changes to the next review interval. - ;; Default value: 0.5 - ;; :srs/learning-fraction 0.5 - - ;; Set the initial interval after the first successful review of a card. - ;; Default value: 4 - ;; :srs/initial-interval 4 - - ;; Hide specific block properties. - ;; Example usage: - ;; :block-hidden-properties #{:public :icon} - - ;; Create a page for all properties. - ;; Default value: true - :property-pages/enabled? true - - ;; Properties to exclude from having property pages - ;; Example usage: - ;; :property-pages/excludelist #{:duration :author} - - ;; By default, property value separated by commas will not be treated as - ;; page references. You can add properties to enable it. - ;; Example usage: - ;; :property/separated-by-commas #{:alias :tags} - - ;; Properties that are ignored when parsing property values for references - ;; Example usage: - ;; :ignored-page-references-keywords #{:author :website} - - ;; logbook configuration. - ;; :logbook/settings - ;; {:with-second-support? false ;limit logbook to minutes, seconds will be eliminated - ;; :enabled-in-all-blocks true ;display logbook in all blocks after timetracking - ;; :enabled-in-timestamped-blocks false ;don't display logbook at all - ;; } - - ;; Mobile photo upload configuration. 
- ;; :mobile/photo - ;; {:allow-editing? true - ;; :quality 80} - - ;; Mobile features options - ;; Gestures - ;; Example usage: - ;; :mobile - ;; {:gestures/disabled-in-block-with-tags ["kanban"]} - - ;; Extra CodeMirror options - ;; See https://codemirror.net/5/doc/manual.html#config for possible options - ;; Example usage: - ;; :editor/extra-codemirror-options - ;; {:lineWrapping false ; Default value: false - ;; :lineNumbers true ; Default value: true - ;; :readOnly false} ; Default value: false - - ;; Enable logical outdenting - ;; Default value: false - ;; :editor/logical-outdenting? false - - ;; Prefer pasting the file when text and a file are in the clipboard. - ;; Default value: false - ;; :editor/preferred-pasting-file? false - - ;; Quick capture templates for receiving content from other apps. - ;; Each template contains three elements {time}, {text} and {url}, which can be auto-expanded - ;; by receiving content from other apps. Note: the {} cannot be omitted. - ;; - {time}: capture time - ;; - {date}: capture date using current date format, use `[[{date}]]` to get a page reference - ;; - {text}: text that users selected before sharing. - ;; - {url}: URL or assets path for media files stored in Logseq. - ;; You can also reorder them or use only one or two of them in the template. - ;; You can also insert or format any text in the template, as shown in the following examples. - ;; :quick-capture-templates - ;; {:text "[[quick capture]] **{time}**: {text} from {url}" - ;; :media "[[quick capture]] **{time}**: {url}"} - - ;; Quick capture options. - ;; - insert-today? Insert the capture at the end of today's journal page (boolean). - ;; - redirect-page? Redirect to the quick capture page after capturing (boolean). - ;; - default-page The default page to capture to if insert-today? is false (string). - ;; :quick-capture-options - ;; {:insert-today? false ;; Default value: true - ;; :redirect-page? 
false ;; Default value: false - ;; :default-page "quick capture"} ;; Default page: "quick capture" - - ;; File sync options - ;; Ignore these files when syncing, regexp is supported. - ;; :file-sync/ignore-files [] - - ;; Configure the Enter key behavior for - ;; context-aware editing with DWIM (Do What I Mean). - ;; context-aware Enter key behavior implies that pressing Enter will - ;; have different outcomes based on the context. - ;; For instance, pressing Enter within a list generates a new list item, - ;; whereas pressing Enter in a block reference opens the referenced block. - ;; :dwim/settings - ;; {:admonition&src? true ;; Default value: true - ;; :markup? false ;; Default value: false - ;; :block-ref? true ;; Default value: true - ;; :page-ref? true ;; Default value: true - ;; :properties? true ;; Default value: true - ;; :list? false} ;; Default value: false - - ;; Configure the escaping method for special characters in page titles. - ;; Warning: - ;; This is a dangerous operation. To modify the setting, - ;; access the 'Filename format' setting and follow the instructions. - ;; Otherwise, You may need to manually rename all affected files and - ;; re-index them on all clients after synchronization. - ;; Incorrect handling may result in messy page titles. - ;; Available options: - ;; - :triple-lowbar (default) - ;; ;use triple underscore `___` for slash `/` in page title - ;; ;use Percent-encoding for other invalid characters - :file/name-format :triple-lowbar} diff --git a/notes/logseq/config.end b/notes/logseq/config.end deleted file mode 100644 index 5ef6e65..0000000 --- a/notes/logseq/config.end +++ /dev/null @@ -1,421 +0,0 @@ -{:meta/version 1 - - ;; Set the preferred format. - ;; Available options: - ;; - Markdown (default) - ;; - Org - ;; :preferred-format "Markdown" - - ;; Set the preferred workflow style. 
- ;; Available options: - ;; - :now for NOW/LATER style (default) - ;; - :todo for TODO/DOING style - :preferred-workflow :now - - ;; Exclude directories/files. - ;; Example usage: - ;; :hidden ["/archived" "/test.md" "../assets/archived"] - :hidden [] - - ;; Define the default journal page template. - ;; Enter the template name between the quotes. - :default-templates - {:journals ""} - - ;; Set a custom date format for the journal page title. - ;; Default value: "MMM do, yyyy" - ;; e.g., "Jan 19th, 2038" - ;; Example usage e.g., "Tue 19th, Jan 2038" - ;; :journal/page-title-format "EEE do, MMM yyyy" - - ;; Specify the journal filename format using a valid date format string. - ;; !Warning: - ;; This configuration is not retroactive and affects only new journals. - ;; To show old journal files in the app, manually rename the files in the - ;; journal directory to match the new format. - ;; Default value: "yyyy_MM_dd" - ;; :journal/file-name-format "yyyy_MM_dd" - - ;; Enable tooltip preview on hover. - ;; Default value: true - :ui/enable-tooltip? true - - ;; Display brackets [[]] around page references. - ;; Default value: true - ;; :ui/show-brackets? true - - ;; Display all lines of a block when referencing ((block)). - ;; Default value: false - :ui/show-full-blocks? false - - ;; Automatically expand block references when zooming in. - ;; Default value: true - :ui/auto-expand-block-refs? true - - ;; Enable Block timestamps. - ;; Default value: false - :feature/enable-block-timestamps? false - - ;; Disable accent marks when searching. - ;; After changing this setting, rebuild the search index by pressing (^C ^S). - ;; Default value: true - :feature/enable-search-remove-accents? true - - ;; Enable journals. - ;; Default value: true - ;; :feature/enable-journals? true - - ;; Enable flashcards. - ;; Default value: true - ;; :feature/enable-flashcards? true - - ;; Enable whiteboards. - ;; Default value: true - ;; :feature/enable-whiteboards? 
true - - ;; Disable the journal's built-in 'Scheduled tasks and deadlines' query. - ;; Default value: false - ;; :feature/disable-scheduled-and-deadline-query? false - - ;; Specify the number of days displayed in the future for - ;; the 'scheduled tasks and deadlines' query. - ;; Example usage: - ;; Display all scheduled and deadline blocks for the next 14 days: - ;; :scheduled/future-days 14 - ;; Default value: 7 - ;; :scheduled/future-days 7 - - ;; Specify the first day of the week. - ;; Available options: - ;; - integer from 0 to 6 (Monday to Sunday) - ;; Default value: 6 (Sunday) - :start-of-week 6 - - ;; Specify a custom CSS import. - ;; This option takes precedence over the local `logseq/custom.css` file. - ;; Example usage: - ;; :custom-css-url "@import url('https://cdn.jsdelivr.net/gh/dracula/logseq@master/custom.css');" - - ;; Specify a custom JS import. - ;; This option takes precedence over the local `logseq/custom.js` file. - ;; Example usage: - ;; :custom-js-url "https://cdn.logseq.com/custom.js" - - ;; Set a custom Arweave gateway - ;; Default gateway: https://arweave.net - ;; :arweave/gateway "https://arweave.net" - - ;; Set bullet indentation when exporting - ;; Available options: - ;; - `:eight-spaces` as eight spaces - ;; - `:four-spaces` as four spaces - ;; - `:two-spaces` as two spaces - ;; - `:tab` as a tab character (default) - ;; :export/bullet-indentation :tab - - ;; Publish all pages within the Graph - ;; Regardless of whether individual pages have been marked as public. - ;; Default value: false - ;; :publishing/all-pages-public? false - - ;; Define the default home page and sidebar status. - ;; If unspecified, the journal page will be loaded on startup and the right sidebar will stay hidden. - ;; The `:page` value represents the name of the page displayed at startup. - ;; Available options for `:sidebar` are: - ;; - "Contents" to display the Contents page in the right sidebar. - ;; - A specific page name to display in the right sidebar. 
- ;; - An array of multiple pages, e.g., ["Contents" "Page A" "Page B"]. - ;; If `:sidebar` remains unset, the right sidebar will stay hidden. - ;; Examples: - ;; 1. Set "Changelog" as the home page and display "Contents" in the right sidebar: - ;; :default-home {:page "Changelog", :sidebar "Contents"} - ;; 2. Set "Jun 3rd, 2021" as the home page without the right sidebar: - ;; :default-home {:page "Jun 3rd, 2021"} - ;; 3. Set "home" as the home page and display multiple pages in the right sidebar: - ;; :default-home {:page "home", :sidebar ["Page A" "Page B"]} - - ;; Set the default location for storing notes. - ;; Default value: "pages" - ;; :pages-directory "pages" - - ;; Set the default location for storing journals. - ;; Default value: "journals" - ;; :journals-directory "journals" - - ;; Set the default location for storing whiteboards. - ;; Default value: "whiteboards" - ;; :whiteboards-directory "whiteboards" - - ;; Enabling this option converts - ;; [[Grant Ideas]] to [[file:./grant_ideas.org][Grant Ideas]] for org-mode. - ;; For more information, visit https://github.com/logseq/logseq/issues/672 - ;; :org-mode/insert-file-link? false - - ;; Configure custom shortcuts. - ;; Syntax: - ;; 1. + indicates simultaneous key presses, e.g., `Ctrl+Shift+a`. - ;; 2. A space between keys represents key chords, e.g., `t s` means - ;; pressing `t` followed by `s`. - ;; 3. mod refers to `Ctrl` for Windows/Linux and `Command` for Mac. - ;; 4. Use false to disable a specific shortcut. - ;; 5. You can define multiple bindings for a single action, e.g., ["ctrl+j" "down"]. 
- ;; The full list of configurable shortcuts is available at: - ;; https://github.com/logseq/logseq/blob/master/src/main/frontend/modules/shortcut/config.cljs - ;; Example: - ;; :shortcuts - ;; {:editor/new-block "enter" - ;; :editor/new-line "shift+enter" - ;; :editor/insert-link "mod+shift+k" - ;; :editor/highlight false - ;; :ui/toggle-settings "t s" - ;; :editor/up ["ctrl+k" "up"] - ;; :editor/down ["ctrl+j" "down"] - ;; :editor/left ["ctrl+h" "left"] - ;; :editor/right ["ctrl+l" "right"]} - :shortcuts {} - - ;; Configure the behavior of pressing Enter in document mode. - ;; if set to true, pressing Enter will create a new block. - ;; Default value: false - :shortcut/doc-mode-enter-for-new-block? false - - ;; Block content larger than `block/content-max-length` will not be searchable - ;; or editable for performance. - ;; Default value: 10000 - :block/content-max-length 10000 - - ;; Display command documentation on hover. - ;; Default value: true - :ui/show-command-doc? true - - ;; Display empty bullet points. - ;; Default value: false - :ui/show-empty-bullets? false - - ;; Pre-defined :view function to use with advanced queries. - :query/views - {:pprint - (fn [r] [:pre.code (pprint r)])} - - ;; Advanced queries `:result-transform` function. - ;; Transform the query result before displaying it. - :query/result-transforms - {:sort-by-priority - (fn [result] (sort-by (fn [h] (get h :block/priority "Z")) result))} - - ;; The following queries will be displayed at the bottom of today's journal page. - ;; The "NOW" query returns tasks with "NOW" or "DOING" status. - ;; The "NEXT" query returns tasks with "NOW", "LATER", or "TODO" status. - :default-queries - {:journals - [{:title "🔨 NOW" - :query [:find (pull ?h [*]) - :in $ ?start ?today - :where - [?h :block/marker ?marker] - [(contains? #{"NOW" "DOING"} ?marker)] - [?h :block/page ?p] - [?p :block/journal? 
true] - [?p :block/journal-day ?d] - [(>= ?d ?start)] - [(<= ?d ?today)]] - :inputs [:14d :today] - :result-transform (fn [result] - (sort-by (fn [h] - (get h :block/priority "Z")) result)) - :group-by-page? false - :collapsed? false} - {:title "📅 NEXT" - :query [:find (pull ?h [*]) - :in $ ?start ?next - :where - [?h :block/marker ?marker] - [(contains? #{"NOW" "LATER" "TODO"} ?marker)] - [?h :block/page ?p] - [?p :block/journal? true] - [?p :block/journal-day ?d] - [(> ?d ?start)] - [(< ?d ?next)]] - :inputs [:today :7d-after] - :group-by-page? false - :collapsed? false}]} - - ;; Add custom commands to the command palette - ;; Example usage: - ;; :commands - ;; [ - ;; ["js" "Javascript"] - ;; ["md" "Markdown"] - ;; ] - :commands [] - - ;; Enable collapsing blocks with titles but no children. - ;; By default, only blocks with children can be collapsed. - ;; Setting `:outliner/block-title-collapse-enabled?` to true allows collapsing - ;; blocks with titles (multiple lines) and content. For example: - ;; - block title - ;; block content - ;; Default value: false - :outliner/block-title-collapse-enabled? false - - ;; Macros replace texts and will make you more productive. - ;; Example usage: - ;; Change the :macros value below to: - ;; {"poem" "Rose is $1, violet's $2. Life's ordered: Org assists you."} - ;; input "{{poem red,blue}}" - ;; becomes - ;; Rose is red, violet's blue. Life's ordered: Org assists you. - :macros {} - - ;; Configure the default expansion level for linked references. - ;; For example, consider the following block hierarchy: - ;; - a [[page]] (level 1) - ;; - b (level 2) - ;; - c (level 3) - ;; - d (level 4) - ;; - ;; With the default value of level 2, block b will be collapsed. - ;; If the level's value is set to 3, block c will be collapsed. - ;; Default value: 2 - :ref/default-open-blocks-level 2 - - ;; Configure the threshold for linked references before collapsing. 
- ;; Default value: 100 - :ref/linked-references-collapsed-threshold 50 - - ;; Graph view configuration. - ;; Example usage: - ;; :graph/settings - ;; {:orphan-pages? true ; Default value: true - ;; :builtin-pages? false ; Default value: false - ;; :excluded-pages? false ; Default value: false - ;; :journal? false} ; Default value: false - - ;; Graph view configuration. - ;; Example usage: - ;; :graph/forcesettings - ;; {:link-dist 180 ; Default value: 180 - ;; :charge-strength -600 ; Default value: -600 - ;; :charge-range 600} ; Default value: 600 - - ;; Favorites to list on the left sidebar - :favorites [] - - ;; Set flashcards interval. - ;; Expected value: - ;; - Float between 0 and 1 - ;; higher values result in faster changes to the next review interval. - ;; Default value: 0.5 - ;; :srs/learning-fraction 0.5 - - ;; Set the initial interval after the first successful review of a card. - ;; Default value: 4 - ;; :srs/initial-interval 4 - - ;; Hide specific block properties. - ;; Example usage: - ;; :block-hidden-properties #{:public :icon} - - ;; Create a page for all properties. - ;; Default value: true - :property-pages/enabled? true - - ;; Properties to exclude from having property pages - ;; Example usage: - ;; :property-pages/excludelist #{:duration :author} - - ;; By default, property value separated by commas will not be treated as - ;; page references. You can add properties to enable it. - ;; Example usage: - ;; :property/separated-by-commas #{:alias :tags} - - ;; Properties that are ignored when parsing property values for references - ;; Example usage: - ;; :ignored-page-references-keywords #{:author :website} - - ;; logbook configuration. - ;; :logbook/settings - ;; {:with-second-support? false ;limit logbook to minutes, seconds will be eliminated - ;; :enabled-in-all-blocks true ;display logbook in all blocks after timetracking - ;; :enabled-in-timestamped-blocks false ;don't display logbook at all - ;; } - - ;; Mobile photo upload configuration. 
- ;; :mobile/photo - ;; {:allow-editing? true - ;; :quality 80} - - ;; Mobile features options - ;; Gestures - ;; Example usage: - ;; :mobile - ;; {:gestures/disabled-in-block-with-tags ["kanban"]} - - ;; Extra CodeMirror options - ;; See https://codemirror.net/5/doc/manual.html#config for possible options - ;; Example usage: - ;; :editor/extra-codemirror-options - ;; {:lineWrapping false ; Default value: false - ;; :lineNumbers true ; Default value: true - ;; :readOnly false} ; Default value: false - - ;; Enable logical outdenting - ;; Default value: false - ;; :editor/logical-outdenting? false - - ;; Prefer pasting the file when text and a file are in the clipboard. - ;; Default value: false - ;; :editor/preferred-pasting-file? false - - ;; Quick capture templates for receiving content from other apps. - ;; Each template contains three elements {time}, {text} and {url}, which can be auto-expanded - ;; by receiving content from other apps. Note: the {} cannot be omitted. - ;; - {time}: capture time - ;; - {date}: capture date using current date format, use `[[{date}]]` to get a page reference - ;; - {text}: text that users selected before sharing. - ;; - {url}: URL or assets path for media files stored in Logseq. - ;; You can also reorder them or use only one or two of them in the template. - ;; You can also insert or format any text in the template, as shown in the following examples. - ;; :quick-capture-templates - ;; {:text "[[quick capture]] **{time}**: {text} from {url}" - ;; :media "[[quick capture]] **{time}**: {url}"} - - ;; Quick capture options. - ;; - insert-today? Insert the capture at the end of today's journal page (boolean). - ;; - redirect-page? Redirect to the quick capture page after capturing (boolean). - ;; - default-page The default page to capture to if insert-today? is false (string). - ;; :quick-capture-options - ;; {:insert-today? false ;; Default value: true - ;; :redirect-page? 
false ;; Default value: false - ;; :default-page "quick capture"} ;; Default page: "quick capture" - - ;; File sync options - ;; Ignore these files when syncing, regexp is supported. - ;; :file-sync/ignore-files [] - - ;; Configure the Enter key behavior for - ;; context-aware editing with DWIM (Do What I Mean). - ;; context-aware Enter key behavior implies that pressing Enter will - ;; have different outcomes based on the context. - ;; For instance, pressing Enter within a list generates a new list item, - ;; whereas pressing Enter in a block reference opens the referenced block. - ;; :dwim/settings - ;; {:admonition&src? true ;; Default value: true - ;; :markup? false ;; Default value: false - ;; :block-ref? true ;; Default value: true - ;; :page-ref? true ;; Default value: true - ;; :properties? true ;; Default value: true - ;; :list? false} ;; Default value: false - - ;; Configure the escaping method for special characters in page titles. - ;; Warning: - ;; This is a dangerous operation. To modify the setting, - ;; access the 'Filename format' setting and follow the instructions. - ;; Otherwise, You may need to manually rename all affected files and - ;; re-index them on all clients after synchronization. - ;; Incorrect handling may result in messy page titles. - ;; Available options: - ;; - :triple-lowbar (default) - ;; ;use triple underscore `___` for slash `/` in page title - ;; ;use Percent-encoding for other invalid characters - :file/name-format :triple-lowbar} diff --git a/notes/pages/card.md b/notes/pages/card.md deleted file mode 100644 index e69de29..0000000 diff --git a/notes/pages/contents.md b/notes/pages/contents.md deleted file mode 100644 index 39cdd0d..0000000 --- a/notes/pages/contents.md +++ /dev/null @@ -1 +0,0 @@ -- diff --git a/notes/pages/dqn.md b/notes/pages/dqn.md deleted file mode 100644 index 6a826ad..0000000 --- a/notes/pages/dqn.md +++ /dev/null @@ -1,40 +0,0 @@ -* What is the bellman equation? 
#card - * $Q(s, a) = r + \lambda * \max_{a'} Q(s', a') * (1 - done(s, a))$ - * $Q(s, a)$: The Q-value of taking action $a$ in state $s$ - * $r$: The reward of taking action $a$ in state $s$ - * $\lambda$: The discount factor - * $\max_{a'} Q(s', a')$: The maximum Q-value of the next state $s'$ - * $done(s, a)$: Whether the episode is done after taking action $a$ in state $s$ -* What does the Q value represent? #card - * The expected cumulative future reward of taking action $a$ in state $s$ following the optimal [[policy]] thereafter. -* What is the objective function of Q-learning? #card - * $L(\theta) = \mathbb{E}[(Q_\theta(s, a) - (r + \lambda * \max_{a'} Q_\theta(s', a')))^2]$ - * $L(\theta)$: The loss function - * $Q(s, a)$: The Q-value of taking action $a$ in state $s$ - * $Q(s', a')$: The Q-value of taking action $a'$ in next state $s'$ - * $r$: The reward of taking action $a$ in state $s$ - * $\lambda$: The discount factor -* What are some differences in the `act` method between `QLearning` and `REINFORCE` #card - * QLearning uses an argmax to select the best action whereas [[REINFORCE]] uses a softmax sample to select an action - * QLearning has an epsilon greedy [[policy]] whereas [[REINFORCE]] has a stochastic [[policy]] -* Define epsilon greedy [[policy]] #card - * An epsilon greedy [[policy]] is a [[policy]] that selects the best action with probability $1 - \epsilon$ and a random action with probability $\epsilon$ -* What are some differences in the `train` method between `QLearning` and `REINFORCE` #card - * QLearning trains on each step whereas [[REINFORCE]] trains at the end of each episode - * [[REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards -* What are some differences in the `collect_episodes` method between `QLearning` and `REINFORCE` #card - * QLearning uses SARS whereas [[REINFORCE]] uses SAR - * [[REINFORCE]] 
needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards - * This is because the bellman equation requires a mixture of one real reward and one predicted reward from the network to properly train -* What are the differences between exploration rate and temperature in Q-learning and REINFORCE? #card - * Exploration rate is a probability of taking a random action - * Temperature is a parameter in the softmax function that controls the stochasticity of the [[policy]] - * Exploration either happens or does not happen and is not controlled by the neural network at all - * Temperature just controls the output of the network. If the network is very confident in one action, it will still take that action with high probability, but if the network is unsure, it will take a random action with some probability. Therefore you don't need to decay temperature, but you do need to decay exploration rate. -* What is action replay memory? Why is it needed in QLearning but not REINFORCE. #card - * A buffer that stores experiences for training the network. - * Q Learning is an off-policy method, so it can learn from past experiences. [[REINFORCE]] is an on-policy method, so it can't learn from past experiences. The replay buffer is a feature of off-policy methods, not a hindrance. -* What is on-policy learning? #card - * On-policy learning is when the agent learns from the same [[policy]] that it uses to interact with the environment. -* What is off-policy learning? #card - * Off-policy learning is when the agent can learn from a different [[policy]] than the one it uses to interact with the environment. This allows for more efficient learning because the agent can learn from past experiences. 
diff --git a/notes/pages/duelingdqn.md b/notes/pages/duelingdqn.md deleted file mode 100644 index f158df0..0000000 --- a/notes/pages/duelingdqn.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -alias: dueling deep q network ---- - -- The Dueling [[DQN]] architecture is a modification of the standard [[DQN]] architecture that separates the [[value function]] and [[advantage function]] into two separate streams. -- Because the [[advantage]] function can be defined as $A(s,a) = Q(s,a) - V(s)$, we can re-arrange this to get the Q-value function as $Q(s,a) = A(s,a) + V(s)$. - - As such, we can create two networks instead of one, a $V(s)$ network and an $A(s,a)$ network, and combine them to get the Q-value function. - - This has the effect of constraining the bias of each network, which can help with convergence. diff --git a/notes/pages/gradient.md b/notes/pages/gradient.md deleted file mode 100644 index 17aeee8..0000000 --- a/notes/pages/gradient.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -alias: gradients ---- diff --git a/notes/pages/henessian.md b/notes/pages/henessian.md deleted file mode 100644 index e69de29..0000000 diff --git a/notes/pages/laplacian.md b/notes/pages/laplacian.md deleted file mode 100644 index e69de29..0000000 diff --git a/notes/pages/log-probability.md b/notes/pages/log-probability.md deleted file mode 100644 index 971b862..0000000 --- a/notes/pages/log-probability.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -alias: log probability, log prob ---- diff --git a/notes/pages/policy-gradient.md b/notes/pages/policy-gradient.md deleted file mode 100644 index ca64048..0000000 --- a/notes/pages/policy-gradient.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -alias: policy gradient, pg ---- diff --git a/notes/pages/reinforce.md b/notes/pages/reinforce.md deleted file mode 100644 index 7949334..0000000 --- a/notes/pages/reinforce.md +++ /dev/null @@ -1,58 +0,0 @@ -- What is the [[REINFORCE]] scoring function? 
#card - - $L(\theta) = \frac{1}{T} \sum_{t=0}^{T-1} G_t \log \pi_{\theta}(a_t | s_t)$ - - $L(\theta)$: The loss function - - $T$: The number of time steps in the episode - - $G_t$: Cumulative discounted future reward at time step $t$ - - $\pi_{\theta}(a_t | s_t)$: The probability of taking action $a_t$ in state $s_t$ under [[policy]] $\pi_{\theta}$ - - $\Delta \theta = \alpha r \frac{\partial \log \pi_{\theta}(s, a)}{\partial \theta}$ - - $\Delta \theta$: The update to the [[policy]] parameters - - $\Delta$ is the first or second [[gradient]]? What is it called? #card - - Second - - [[Laplacian]] operator - - What is the difference between a [[Henessian]] and a [[Laplacian]]? #card - - Henessians produce a matrix of second derivatives, while Laplacians produce a scalar. Both are second order derivatives. - - $\alpha$: The learning rate - - $r$: The reward - - $\partial{\log \pi_{\theta}(s, a)}$: The [[gradient]] of the [[log probability]] of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ -- What is the [[policy gradient]] theorem? #card - - For any differentiable [[policy]] and for any [[policy]] objective function, the [[policy gradient]] is: $\nabla_{\theta} J(\theta) = \mathbb{E}_{\pi_{\theta}}[\nabla_{\theta} \log \pi_{\theta}(a_t | s_t) R(\tau)]$ - - What does the symbol $\nabla$ mean? #card - - The [[gradient]], which is a vector of single partial derivatives in each dimension of the input space. - - $J(\theta)$: The objective function to maximize - - $\pi_{\theta}(a_t | s_t)$: The probability of taking action $a_t$ in state $s_t$ under [[policy]] $\pi_{\theta}$ - - $R(\tau)$: The return of a trajectory $\tau$, which is often formulated as cumulative discounted future rewards. - - How is this different from the [[REINFORCE]] scoring function? 
#card - - It's more general - - It's formulated as an expectation over a [[gradient]] - - It's a maximization rather than a loss - - But very similar to the [[REINFORCE]] scoring function - - $J(\theta)$: The objective function -- What is the training loop for [[REINFORCE]]? #card - - Initialize the [[policy]] $\pi_{\theta}$ with random weights - - For each episode: - - Generate a trajectory $\tau$ by following the [[policy]] $\pi_{\theta}$ - - Compute the return $R(\tau)$ - - Compute the [[policy gradient]] $\nabla_{\theta} J(\theta)$ - - Update the [[policy]] parameters $\theta$ with the [[gradient]] -- What is $\pi_{\theta}(a | s)$? #card - - The function which is learned by the [[REINFORCE]] algorithm - - The probability of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ -- Why do you need to normalize the rewards in [[REINFORCE]]? #card - - Since the rewards are arbitrary and directly part of the objective/loss function, they are normalized to make the optimization more stable. -- What is a multinomial distribution? #card - - A probability distribution over a discrete number of possible outcomes, where each outcome has a probability associated with it. Like a dice roll, where each face has a probability of being rolled. -- What are some advantages of [[policy gradient]] methods over [[value based methods]]? #card - - They can learn stochastic policies - - They can learn policies in high-dimensional or continuous action spaces - - They can have better convergence properties, they can be made to change smoothly over time with sampling rather than depending on an argmax operation. -- What are some disadvantages of [[policy gradient]] methods vs [[value based methods]] ? #card - - They have high variance in the [[gradients]] which can lead to # [[card]] - - Slow convergence - - Catastrophic forgetting - - They can be sensitive to the choice of step size - - They can be computationally expensive -- What is the softmax equation with temperature? 
#card - - $P(i) = \frac{e^{Z_i/T}}{\sum_{j} e^{Z_j/T}}$ - - $P(i)$: The probability of outcome $i$ - - $Z_i$: The logit of outcome $i$ - - $T$: The temperature parameter diff --git a/notes/pages/td.md b/notes/pages/td.md deleted file mode 100644 index e69de29..0000000 diff --git a/notes/pages/value-based-methods.md b/notes/pages/value-based-methods.md deleted file mode 100644 index 38167ae..0000000 --- a/notes/pages/value-based-methods.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -alias: value-based methods, value based methods, value methods, value function ---- From 4abb20bc10e3d7b14efdf5f928677f495544df2e Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Sun, 3 Aug 2025 03:03:55 -0400 Subject: [PATCH 2/4] Upgraded mdlinker --- .pre-commit-config.yaml | 72 ++++++++++++++++++++--------------------- mdlinker.toml | 7 ++-- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a57b198..fafd178 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ repos: -- repo: https://github.com/pre-commit/pre-commit-hooks + - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: end-of-file-fixer @@ -11,38 +11,38 @@ repos: - id: detect-private-key - id: mixed-line-ending args: ["--fix=lf"] -- repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.4.1 - hooks: - - id: remove-tabs - - id: remove-crlf -- repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: v0.9.6 - hooks: - # Run the linter. - - id: ruff - types_or: [ python, pyi, jupyter ] - args: [ --fix ] - # Run the formatter. 
- - id: ruff-format - types_or: [ python, pyi, jupyter ] -- repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.394 - hooks: - - id: pyright -- repo: https://github.com/mwouts/jupytext - rev: v1.16.7 - hooks: - - id: jupytext - args: [--sync] -- repo: https://github.com/ryanpeach/mdlinker - rev: v1.6.1 - hooks: - - id: enforce-ascii - files: continuing_education/.*\.md - - id: mdlinker - files: continuing_education/.*\.md - args: - - "--fix" - - "--allow-dirty" + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.4.1 + hooks: + - id: remove-tabs + - id: remove-crlf + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.9.6 + hooks: + # Run the linter. + - id: ruff + types_or: [python, pyi, jupyter] + args: [--fix] + # Run the formatter. + - id: ruff-format + types_or: [python, pyi, jupyter] + - repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.394 + hooks: + - id: pyright + - repo: https://github.com/mwouts/jupytext + rev: v1.16.7 + hooks: + - id: jupytext + args: [--sync] + - repo: https://github.com/ryanpeach/mdlinker + rev: v1.7.2 + hooks: + - id: enforce-ascii + files: continuing_education/.*\.md + - id: mdlinker + files: continuing_education/.*\.md + args: + - "--fix" + - "--allow-dirty" diff --git a/mdlinker.toml b/mdlinker.toml index 1e221d4..034ce20 100644 --- a/mdlinker.toml +++ b/mdlinker.toml @@ -1,3 +1,4 @@ -pages_directory = "notes/pages" - -other_directories = ["notes/journal"] +new_files_directory = "notes" +files = [ + "continuing_education/**/*.md", +] From bd4c0d0d79df53d40b9c7592113a0374d610b792 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Sun, 3 Aug 2025 15:32:46 -0400 Subject: [PATCH 3/4] Some new notes --- .github/pull_request_template.md | 3 ++ .../math/classic_ml/notes/bagging.md | 7 +++ .../math/statistics/notes/bootstrapping.md | 13 ++++++ .../actor_critic/README.md | 44 ++++++++++++++++--- .../value_based_methods/notes/td.md | 41 +++++++++++++++++ mdlinker.toml | 1 + 
6 files changed, 102 insertions(+), 7 deletions(-) create mode 100644 continuing_education/math/classic_ml/notes/bagging.md create mode 100644 continuing_education/math/statistics/notes/bootstrapping.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index c2b39a4..257055a 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,6 +4,8 @@ Here are some tasks to complete before merging this PR: # Styleguide +- [ ] Did you update the version in Cargo.toml? + ## Jupyter - [ ] Add references to the bottom of the notebook. @@ -34,3 +36,4 @@ if __name__ == '__main__': - [ ] Use singular nouns for filenames. - [ ] Use `-` in filenames instead of spaces. - [ ] For any new notes, you need to go back in other notes and link to them using project search. +- [ ] Put sources on each flashcard. Mark AI as the source if you used AI to generate the flashcard. These can be revisited later. Mark yourself as the source if you wrote the flashcard yourself. diff --git a/continuing_education/math/classic_ml/notes/bagging.md b/continuing_education/math/classic_ml/notes/bagging.md new file mode 100644 index 0000000..a957019 --- /dev/null +++ b/continuing_education/math/classic_ml/notes/bagging.md @@ -0,0 +1,7 @@ +# Bagging + +## What is Bagging? + +Bootstrap aggregating, also called bagging (from bootstrap aggregating) or bootstrapping, is a machine learning (ML) ensemble meta-algorithm designed to improve the stability and accuracy of ML classification and regression algorithms. It also reduces variance and overfitting. Although it is usually applied to decision tree methods, it can be used with any type of method. Bagging is a special case of the ensemble averaging approach. 
+ +https://en.wikipedia.org/wiki/Bootstrap_aggregating diff --git a/continuing_education/math/statistics/notes/bootstrapping.md b/continuing_education/math/statistics/notes/bootstrapping.md new file mode 100644 index 0000000..67da6a4 --- /dev/null +++ b/continuing_education/math/statistics/notes/bootstrapping.md @@ -0,0 +1,13 @@ +# Bootstrapping + +## What is Bootstrapping? + +Bootstrapping is a procedure for estimating the distribution of an estimator by resampling (often with replacement) one's data or a model estimated from the data. Bootstrapping assigns measures of accuracy (bias, variance, confidence intervals, prediction error, etc.) to sample estimates. This technique allows estimation of the sampling distribution of almost any statistic using random sampling methods. + +https://en.wikipedia.org/wiki/Bootstrapping_(statistics) + +## What is the difference between bootstrapping and bagging? + +Bagging is a specific type of bootstrapping that involves creating multiple subsets of the training data by sampling with replacement, training a model on each subset, and then averaging the predictions of the models to improve accuracy and reduce variance. Bootstrapping, in general, refers to the process of resampling data to estimate statistics or build confidence intervals. + +source: AI diff --git a/continuing_education/policy_gradient_methods/actor_critic/README.md b/continuing_education/policy_gradient_methods/actor_critic/README.md index 5072daf..645ad07 100644 --- a/continuing_education/policy_gradient_methods/actor_critic/README.md +++ b/continuing_education/policy_gradient_methods/actor_critic/README.md @@ -1,11 +1,41 @@ # Actor Critic ## What is the [[value]] function used for in [[actor critic]] methods? - - The [[value]] function is used to estimate the *average* expected return from a given state. - - It could theoretically be a Q-function, but in practice, it is often a state-value function using [[TD]] error. 
+ +- The [[value]] function is used by the critic to evaluate the expected return from a given state or state-action pair. +- It provides a baseline for the [[policy gradient]] updates, helping to reduce variance in the learning process. It does this by providing an estimate of the expected return from a state, which can be subtracted from the actual return to compute the state [[advantage]] function. +- The [[value]] function is used to estimate the *average* expected return from a given state. +- It could theoretically be a Q-function, but in practice, it is often a [[value]] function. + +source: Myself + ## What is the [[advantage]] function used for in [[actor critic]] methods? - The difference between the expected reward from a state-action pair (Q) and the average expected reward from just the state (V). - $A(s, a) = Q(s, a) - V(s)$ - It is used to normalize the [[policy gradient]], as well as to push the [[policy gradient]] towards actions that are better than average and away from actions that are worse than average. -## What is the training loop for a2c? #todo -## What is the difference between a2c and a3c? + +- The difference between the expected reward from a state-action pair (Q) and the average expected reward from just the state (V). + - $A(s, a) = Q(s, a) - V(s)$ +- It is normalized making the [[policy gradient]] normalized as well (providing updates around +-0) +- It pushes the [[policy gradient]] towards actions that are better than average and away from actions that are worse than average. + +source: Myself + +## What kinds of normalization are used in [[actor critic]] methods? What are their effects? + + + +## What is the training loop for A2C? + +## What is the training loop for A3C? + +## What does A2C stand for? + +A2C stands for Advantage Actor-Critic. + +## What does A3C stand for? + +A3C stands for Asynchronous Advantage Actor-Critic. + +## What is the difference between A2C and A3C?
+ +A3C is parallel and asynchronous, meaning it uses multiple agents to explore the environment in parallel and updates the model asynchronously. A2C is synchronous, meaning it uses a single agent to explore the environment and updates the model synchronously. + +source: https://en.wikipedia.org/wiki/Actor-critic_algorithm#Variants diff --git a/continuing_education/value_based_methods/notes/td.md b/continuing_education/value_based_methods/notes/td.md index 9e0d19e..97b6298 100644 --- a/continuing_education/value_based_methods/notes/td.md +++ b/continuing_education/value_based_methods/notes/td.md @@ -1 +1,42 @@ # Temporal Difference + +## What is Temporal Difference (TD) learning? + +Temporal difference ([[TD]]) learning refers to a class of model-free reinforcement learning methods which learn by bootstrapping from the current estimate of the value function. These methods sample from the environment, like [[Monte Carlo]] methods, and perform updates based on current estimates, like dynamic programming methods. + +source: https://en.wikipedia.org/wiki/Temporal_difference_learning + +## What is the difference between TD learning and Monte Carlo methods? + +While [[Monte Carlo]] methods only adjust their estimates once the final outcome is known, [[TD]] methods adjust predictions to match later, more accurate, predictions about the future before the final outcome is known. This is a form of [[bootstrapping]]. + +source: https://en.wikipedia.org/wiki/Temporal_difference_learning + +## What are the key features of TD learning? + +- **[[Bootstrapping]]**: [[TD]] methods update estimates based on other learned estimates, rather than waiting for the final outcome. +- **Model-free**: [[TD]] learning does not require a model of the environment, making it +suitable for environments where the dynamics are unknown. +- **Online learning**: [[TD]] methods can learn from each step of interaction with the environment, allowing for continuous updates and learning.
+- **Temporal credit assignment**: [[TD]] learning assigns credit to actions based on their contribution to future rewards, allowing for more efficient learning in environments with delayed rewards. + +source: AI + +## What is the SARSA algorithm? + + +## What is the TD(0) algorithm? + + +## What is the $TD\lambda$ algorithm? + + +## Biological Inspiration of TD Learning + +The [[TD]] algorithm has also received attention in the field of neuroscience. Researchers discovered that the firing rate of dopamine neurons in the ventral tegmental area (VTA) and substantia nigra (SNc) appear to mimic the error function in the algorithm. The error function reports back the difference between the estimated reward at any given state or time step and the actual reward received. The larger the error function, the larger the difference between the expected and actual reward. When this is paired with a stimulus that accurately reflects a future reward, the error can be used to associate the stimulus with the future reward. + +Dopamine cells appear to behave in a similar manner. In one experiment measurements of dopamine cells were made while training a monkey to associate a stimulus with the reward of juice. Initially the dopamine cells increased firing rates when the monkey received juice, indicating a difference in expected and actual rewards. Over time this increase in firing back propagated to the earliest reliable stimulus for the reward. Once the monkey was fully trained, there was no increase in firing rate upon presentation of the predicted reward. Subsequently, the firing rate for the dopamine cells decreased below normal activation when the expected reward was not produced. This mimics closely how the error function in [[TD]] is used for reinforcement learning. + +The relationship between the model and potential neurological function has produced research attempting to use [[TD]] to explain many aspects of behavioral research. 
It has also been used to study conditions such as schizophrenia or the consequences of pharmacological manipulations of dopamine on learning. + +https://en.wikipedia.org/wiki/Temporal_difference_learning#In_neuroscience diff --git a/mdlinker.toml b/mdlinker.toml index 034ce20..217580e 100644 --- a/mdlinker.toml +++ b/mdlinker.toml @@ -1,4 +1,5 @@ new_files_directory = "notes" files = [ "continuing_education/**/*.md", + "notes/**/*.md", ] From 1e48ceeccdfb782c15c17b27c77fa83b64fc9485 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Fri, 7 Nov 2025 00:23:10 -0500 Subject: [PATCH 4/4] Using readmes at the root of directories now to indicate their concept --- .pre-commit-config.yaml | 20 ++++++------- .../math/calculus/gradient.md | 2 +- .../math/calculus/henessian.md | 2 +- .../math/calculus/laplacian.md | 4 +-- .../math/statistics/log_probability/README.md | 2 +- .../policy_gradient_methods/README.md | 2 ++ .../actor_critic/README.md | 6 ++-- .../notes/advantage.md | 2 +- .../reinforce/README.md | 24 ++++++++-------- .../value_based_methods/dqn/README.md | 28 +++++++++---------- .../value_based_methods/duelingdqn/README.md | 6 ++-- .../value_based_methods/notes/value.md | 2 +- .../actor critic.md | 0 notes/dqn.md | 0 notes/log probability.md | 0 notes/log-probability.md | 0 notes/policy-gradient.md | 0 notes/reinforce.md | 0 18 files changed, 51 insertions(+), 49 deletions(-) create mode 100644 continuing_education/policy_gradient_methods/README.md rename continuing_education/policy_gradient_methods/notes/policy-gradient.md => notes/actor critic.md (100%) create mode 100644 notes/dqn.md create mode 100644 notes/log probability.md create mode 100644 notes/log-probability.md create mode 100644 notes/policy-gradient.md create mode 100644 notes/reinforce.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fafd178..e2c0e00 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,13 +36,13 @@ repos: hooks: - id: jupytext args: [--sync] - 
- repo: https://github.com/ryanpeach/mdlinker - rev: v1.7.2 - hooks: - - id: enforce-ascii - files: continuing_education/.*\.md - - id: mdlinker - files: continuing_education/.*\.md - args: - - "--fix" - - "--allow-dirty" + # - repo: https://github.com/ryanpeach/mdlinker + # rev: v1.7.2 + # hooks: + # - id: enforce-ascii + # files: continuing_education/.*\.md + # - id: mdlinker + # files: continuing_education/.*\.md + # args: + # - "--fix" + # - "--allow-dirty" diff --git a/continuing_education/math/calculus/gradient.md b/continuing_education/math/calculus/gradient.md index 0725928..d0cc670 100644 --- a/continuing_education/math/calculus/gradient.md +++ b/continuing_education/math/calculus/gradient.md @@ -1,4 +1,4 @@ -# Gradient +# [[Gradient]] ## What is the [[gradient]]? diff --git a/continuing_education/math/calculus/henessian.md b/continuing_education/math/calculus/henessian.md index 8903526..3e4a0d0 100644 --- a/continuing_education/math/calculus/henessian.md +++ b/continuing_education/math/calculus/henessian.md @@ -1,4 +1,4 @@ -# Henessian +# [[Henessian]] ## What is a [[Henessian]]? diff --git a/continuing_education/math/calculus/laplacian.md b/continuing_education/math/calculus/laplacian.md index 95ac7ef..bcecbf6 100644 --- a/continuing_education/math/calculus/laplacian.md +++ b/continuing_education/math/calculus/laplacian.md @@ -1,6 +1,6 @@ -## Laplacian +## [[Laplacian]] -## What is the Laplacian +## What is the [[Laplacian]] The second order [[gradient]]. Produces a scalar. diff --git a/continuing_education/math/statistics/log_probability/README.md b/continuing_education/math/statistics/log_probability/README.md index be1b9cb..34ff54a 100644 --- a/continuing_education/math/statistics/log_probability/README.md +++ b/continuing_education/math/statistics/log_probability/README.md @@ -1,6 +1,6 @@ # Log Probability -## Why would you use a negative [[log-probability]] in a loss function? 
+## Why would you use a negative [[continuing_education/math/statistics/log_probability/README|log-probability]] in a loss function? It's infinity at 0 and 0 at 1, which means at high confidence in something you get a low loss approaching 0, and at low confidence you get a high loss approaching infinity. It gives a strong [[gradient]] signal to the network to update its parameters. diff --git a/continuing_education/policy_gradient_methods/README.md b/continuing_education/policy_gradient_methods/README.md new file mode 100644 index 0000000..1a043ac --- /dev/null +++ b/continuing_education/policy_gradient_methods/README.md @@ -0,0 +1,2 @@ + +# [[Policy]]-[[gradient]] methods diff --git a/continuing_education/policy_gradient_methods/actor_critic/README.md b/continuing_education/policy_gradient_methods/actor_critic/README.md index 5072daf..e227154 100644 --- a/continuing_education/policy_gradient_methods/actor_critic/README.md +++ b/continuing_education/policy_gradient_methods/actor_critic/README.md @@ -1,11 +1,11 @@ # Actor Critic -## What is the [[value]] function used for in [[actor critic]] methods? +## What is the [[value]] function used for in [[continuing_education/policy_gradient_methods/actor_critic/README|actor critic]] methods? - The [[value]] function is used to estimate the *average* expected return from a given state. - It could theoretically be a Q-function, but in practice, it is often a state-value function using [[TD]] error. -## What is the [[advantage]] function used for in [[actor critic]] methods? +## What is the [[advantage]] function used for in [[continuing_education/policy_gradient_methods/actor_critic/README|actor critic]] methods? - The difference between the expected reward from a state-action pair (Q) and the average expected reward from just the state (V).
- $A(s, a) = Q(s, a) - V(s)$ - - It is used to normalize the [[policy gradient]], as well as to push the [[policy gradient]] towards actions that are better than average and away from actions that are worse than average. + - It is used to normalize the [[continuing_education/policy_gradient_methods/README|policy-gradient]], as well as to push the [[continuing_education/policy_gradient_methods/README|policy-gradient]] towards actions that are better than average and away from actions that are worse than average. ## What is the training loop for a2c? #todo ## What is the difference between a2c and a3c? diff --git a/continuing_education/policy_gradient_methods/notes/advantage.md b/continuing_education/policy_gradient_methods/notes/advantage.md index 5147d0a..53b495b 100644 --- a/continuing_education/policy_gradient_methods/notes/advantage.md +++ b/continuing_education/policy_gradient_methods/notes/advantage.md @@ -5,5 +5,5 @@ ## What is the intuition behind the [[advantage]] function? - The [[advantage]] function is a measure of how much better an action is compared to the average action in a given state. - Learning relative [[advantage]] is easier and has less variance than learning absolute values. [[Advantage]] is more relevant to decision making via argmax than absolute values. -## Define the [[advantage]] function in terms of the Q-value function and the value function. +## Define the [[advantage]] function in terms of the Q-value function and the [[value]] function. - $A(s,a) = Q(s,a) - V(s)$ diff --git a/continuing_education/policy_gradient_methods/reinforce/README.md b/continuing_education/policy_gradient_methods/reinforce/README.md index add1a18..91cfc5b 100644 --- a/continuing_education/policy_gradient_methods/reinforce/README.md +++ b/continuing_education/policy_gradient_methods/reinforce/README.md @@ -1,6 +1,6 @@ -# Reinforce +# REINFORCE -## What is the [[REINFORCE]] scoring function? 
+## What is the [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] scoring function? - $L(\theta) = \frac{1}{T} \sum_{t=0}^{T-1} G_t \log \pi_{\theta}(a_t | s_t)$ - $L(\theta)$: The loss function @@ -11,35 +11,35 @@ - $\Delta \theta$: The update to the [[policy]] parameters. The [[laplacian]] - $\alpha$: The learning rate - $r$: The reward - - $\partial{\log \pi_{\theta}(s, a)}$: The [[gradient]] of the [[log probability]] of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ + - $\partial{\log \pi_{\theta}(s, a)}$: The [[gradient]] of the [[continuing_education/math/statistics/log_probability/README|log probability]] of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ -## What is the [[policy gradient]] theorem? +## What is the [[continuing_education/policy_gradient_methods/README|policy-gradient]] theorem? -- For any differentiable [[policy]] and for any [[policy]] objective function, the [[policy gradient]] is: $\nabla_{\theta} J(\theta) = \mathbb{E}_{\pi_{\theta}}[\nabla_{\theta} \log \pi_{\theta}(a_t | s_t) R(\tau)]$ +- For any differentiable [[policy]] and for any [[policy]] objective function, the [[continuing_education/policy_gradient_methods/README|policy-gradient]] is: $\nabla_{\theta} J(\theta) = \mathbb{E}_{\pi_{\theta}}[\nabla_{\theta} \log \pi_{\theta}(a_t | s_t) R(\tau)]$ - $J(\theta)$: The objective function to maximize - $\pi_{\theta}(a_t | s_t)$: The probability of taking action $a_t$ in state $s_t$ under [[policy]] $\pi_{\theta}$ - $R(\tau)$: The return of a trajectory $\tau$, which is often formulated as cumulative discounted future rewards. - $J(\theta)$: The objective function -## How does [[REINFORCE]] implement policy gradient? +## How does [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] implement [[policy]] gradient? Using a monte carlo method over episode rollouts. -## What is the training loop for [[REINFORCE]]? 
+## What is the training loop for [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]]? - Initialize the [[policy]] $\pi_{\theta}$ with random weights - For each episode: - Generate a trajectory $\tau$ by following the [[policy]] $\pi_{\theta}$ - Compute the return $R(\tau)$ - - Compute the [[policy gradient]] $\nabla_{\theta} J(\theta)$ + - Compute the [[continuing_education/policy_gradient_methods/README|policy-gradient]] $\nabla_{\theta} J(\theta)$ - Update the [[policy]] parameters $\theta$ with the [[gradient]] ## What is $\pi_{\theta}(a | s)$? -- The function which is learned by the [[REINFORCE]] algorithm +- The function which is learned by the [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] algorithm - The probability of taking action $a$ in state $s$ under [[policy]] $\pi_{\theta}$ -## Why do you need to normalize the rewards in [[REINFORCE]]? +## Why do you need to normalize the rewards in [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]]? Since the rewards are arbitrary and directly part of the objective/loss function, they are normalized to make the optimization more stable. @@ -47,13 +47,13 @@ Since the rewards are arbitrary and directly part of the objective/loss function A probability distribution over a discrete number of possible outcomes, where each outcome has a probability associated with it. Like a dice roll, where each face has a probability of being rolled. -## What are some advantages of [[policy gradient]] methods over [[value]] based methods? +## What are some advantages of [[continuing_education/policy_gradient_methods/README|policy-gradient]] methods over [[value]] based methods? - They can learn stochastic policies - They can learn policies in high-dimensional or continuous action spaces - They can have better convergence properties, they can be made to change smoothly over time with sampling rather than depending on an argmax operation. 
-## What are some disadvantages of [[policy gradient]] methods vs [[value]] based methods ? +## What are some disadvantages of [[continuing_education/policy_gradient_methods/README|policy-gradient]] methods vs [[value]] based methods ? - They have high variance in the [[gradient]]s which can lead to - Slow convergence diff --git a/continuing_education/value_based_methods/dqn/README.md b/continuing_education/value_based_methods/dqn/README.md index b9a3cfe..7e71dfe 100644 --- a/continuing_education/value_based_methods/dqn/README.md +++ b/continuing_education/value_based_methods/dqn/README.md @@ -1,7 +1,7 @@ # DQN ## What is the bellman equation? -* The Bellman equation is a recursive relationship that defines the value of a state-action pair in terms of the immediate reward and the expected value of the next state. +* The Bellman equation is a recursive relationship that defines the [[value]] of a state-action pair in terms of the immediate reward and the expected value of the next state. * $Q(s, a) = r + \lambda * max_a'(Q(s', a')) * (1 - done(s, a))$ * $Q(s, a)$: The Q-value of taking action $a$ in state $s$ @@ -10,7 +10,7 @@ * $max_a'(Q(s', a'))$: The maximum Q-value of the next state $s'$ * $done(s, a)$: Whether the episode is done after taking action $a$ in state $s$ -## What does the Q value represent? +## What does the Q [[value]] represent? The expected cumulative future reward of taking action $a$ in state $s$ following the optimal [[policy]] thereafter. 
@@ -23,37 +23,37 @@ The expected cumulative future reward of taking action $a$ in state $s$ followin * $r$: The reward of taking action $a$ in state $s$ * $\lambda$: The discount factor -## What are some differences in the `act` method between [[dqn]] and [[REINFORCE]] +## What are some differences in the `act` method between [[continuing_education/value_based_methods/dqn/README|dqn]] and [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] -* QLearning uses an argmax to select the best action whereas [[REINFORCE]] uses a softmax sample to select an action -* QLearning has an epsilon greedy [[policy]] whereas [[REINFORCE]] has a stochastic [[policy]] +* QLearning uses an argmax to select the best action whereas [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] uses a softmax sample to select an action +* QLearning has an epsilon greedy [[policy]] whereas [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] has a stochastic [[policy]] ## Define an epsilon greedy [[policy]] * An epsilon greedy [[policy]] is a [[policy]] that selects the best action with probability $1 - \epsilon$ and a random action with probability $\epsilon$ -## What are some differences in the `train` method between [[dqn]] and [[REINFORCE]] +## What are some differences in the `train` method between [[continuing_education/value_based_methods/dqn/README|dqn]] and [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] -* QLearning trains on each step whereas [[REINFORCE]] trains at the end of each episode -* [[REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards +* QLearning trains on each step whereas [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] trains at the end of each episode +* 
[[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards -## What are some differences in the `collect_episodes` method between [[dqn]] and [[REINFORCE]] +## What are some differences in the `collect_episodes` method between [[continuing_education/value_based_methods/dqn/README|dqn]] and [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] -* QLearning uses SARS whereas [[REINFORCE]] uses SAR -* [[REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards +* QLearning uses SARS whereas [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] uses SAR +* [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] needs a whole trajectory to train, because it operates on real cumulative rewards, whereas QLearning can train on each step because it operates on predicted cumulative rewards * This is because the bellman equation requires a mixture of one real reward and one predicted reward from the network to properly train -## What are the differences between exploration rate and temperature in [[dqn]] and [[REINFORCE]]? +## What are the differences between exploration rate and temperature in [[continuing_education/value_based_methods/dqn/README|dqn]] and [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]]? * Exploration rate is a probability of taking a random action * Temperature is a parameter in the softmax function that controls the stochasticity of the [[policy]] * Exploration either happens or does not happen and is not controlled by the neural network at all * Temperature just controls the output of the network. 
If the network is very confident in one action, it will still take that action with high probability, but if the network is unsure, it will take a random action with some probability. Therefore you don't need to decay temperature, but you do need to decay exploration rate. -## What is action replay memory? Why is it needed in [[dqn]] but not [[REINFORCE]]. +## What is action replay memory? Why is it needed in [[continuing_education/value_based_methods/dqn/README|dqn]] but not [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]]. * A buffer that stores experiences for training the network. -* [[dqn]] is an off-policy method, so it can learn from past experiences. [[REINFORCE]] is an on-policy method, so it can't learn from past experiences. The replay buffer is a feature of off-policy methods, not a hindrance. +* [[continuing_education/value_based_methods/dqn/README|dqn]] is an off-policy method, so it can learn from past experiences. [[continuing_education/policy_gradient_methods/reinforce/README|REINFORCE]] is an on-policy method, so it can't learn from past experiences. The replay buffer is a feature of off-policy methods, not a hindrance. ## What is on-policy learning? diff --git a/continuing_education/value_based_methods/duelingdqn/README.md b/continuing_education/value_based_methods/duelingdqn/README.md index 92978c2..76af7b1 100644 --- a/continuing_education/value_based_methods/duelingdqn/README.md +++ b/continuing_education/value_based_methods/duelingdqn/README.md @@ -2,12 +2,12 @@ ## What is the dueling DQN architecture? -The Dueling [[dqn]] architecture is a modification of the standard [[dqn]] architecture that separates the [[value]] function and [[advantage]] function into two separate streams. 
+The Dueling [[continuing_education/value_based_methods/dqn/README|dqn]] architecture is a modification of the standard [[continuing_education/value_based_methods/dqn/README|dqn]] architecture that separates the [[value]] function and [[advantage]] function into two separate streams. -## What is the mathematical identity linking the advantage function, the Q function, and the value function? +## What is the mathematical identity linking the [[advantage]] function, the Q function, and the [[value]] function? Because the [[advantage]] function can be defined as $A(s,a) = Q(s,a) - V(s)$, we can re-arrange this to get the Q-value function as $Q(s,a) = A(s,a) + V(s)$. -## What advantage does separating the advantage and value functions provide? +## What advantage does separating the [[advantage]] and [[value]] functions provide? This has the effect of constraining the bias of each network, which can help with convergence. diff --git a/continuing_education/value_based_methods/notes/value.md b/continuing_education/value_based_methods/notes/value.md index 4c11c52..42f4a5c 100644 --- a/continuing_education/value_based_methods/notes/value.md +++ b/continuing_education/value_based_methods/notes/value.md @@ -1 +1 @@ -# Value +# [[Value]] diff --git a/continuing_education/policy_gradient_methods/notes/policy-gradient.md b/notes/actor critic.md similarity index 100% rename from continuing_education/policy_gradient_methods/notes/policy-gradient.md rename to notes/actor critic.md diff --git a/notes/dqn.md b/notes/dqn.md new file mode 100644 index 0000000..e69de29 diff --git a/notes/log probability.md b/notes/log probability.md new file mode 100644 index 0000000..e69de29 diff --git a/notes/log-probability.md b/notes/log-probability.md new file mode 100644 index 0000000..e69de29 diff --git a/notes/policy-gradient.md b/notes/policy-gradient.md new file mode 100644 index 0000000..e69de29 diff --git a/notes/reinforce.md b/notes/reinforce.md new file mode 100644 index 0000000..e69de29