diff --git a/esl_mixdchlet.c b/esl_mixdchlet.c index d7fb111..37b16a8 100644 --- a/esl_mixdchlet.c +++ b/esl_mixdchlet.c @@ -46,6 +46,12 @@ */ ESL_MIXDCHLET * esl_mixdchlet_Create(int Q, int K) +{ + return esl_mixdchlet_CreateForFitting(Q, K, 0); +} + +ESL_MIXDCHLET * +esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric) { ESL_MIXDCHLET *dchl = NULL; int status; @@ -64,6 +70,7 @@ esl_mixdchlet_Create(int Q, int K) dchl->Q = Q; dchl->K = K; + dchl->isDnaStrandSymmetric = isDnaStrandSymmetric; return dchl; ERROR: @@ -218,6 +225,25 @@ mixdchlet_pack_paramvector(ESL_MIXDCHLET *dchl, double *p) int j = 0; /* counter in packed parameter vector
*/ int k,a; /* indices over components, residues */ + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + p[j++] = log(dchl->q[k] * 2); // Prob(component) * 2 = Prob(pair) + + if (k == dchl->Q - 1) // unpaired self-symmetric component + p[j++] = log(dchl->q[k]); + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + p[j++] = log(dchl->alpha[k][a]); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + p[j++] = log(dchl->alpha[k][a]); + + return; + } + /* mixture coefficients */ for (k = 0; k < dchl->Q; k++) p[j++] = log(dchl->q[k]); @@ -240,6 +266,27 @@ mixdchlet_unpack_paramvector(double *p, ESL_MIXDCHLET *dchl) int j = 0; /* counter in packed parameter vector
*/
int k,a; /* indices over components, residues */
+ if (dchl->isDnaStrandSymmetric)
+ {
+ for (k = 0; k < dchl->Q - 1; k += 2) // paired components
+ dchl->q[k] = dchl->q[k+1] = exp(p[j++]) / 2;
+
+ if (k == dchl->Q - 1) // unpaired self-symmetric component
+ dchl->q[k] = exp(p[j++]);
+
+ esl_vec_DNorm(dchl->q, dchl->Q);
+
+ for (k = 0; k < dchl->Q - 1; k += 2) // paired components
+ for (a = 0; a < dchl->K; a++)
+ dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = exp(p[j++]);
+
+ if (k == dchl->Q - 1) // unpaired self-symmetric component
+ for (a = 0; a < dchl->K / 2; a++)
+ dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = exp(p[j++]);
+
+ return;
+ }
+
/* mixture coefficients */
for (k = 0; k < dchl->Q; k++)
dchl->q[k] = exp(p[j++]);
@@ -278,6 +325,7 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp)
double sum_alpha; // |alpha_k|
double sum_c; // |c_i|
double psi1; // \Psi(c_ia + alpha_ka)
+ double psi1b; // \Psi(c_ia' + alpha_ka)
double psi2; // \Psi( |c_i| + |alpha_k| )
double psi3; // \Psi( |alpha_k| )
double psi4; // \Psi( alpha_ka )
@@ -289,9 +337,61 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp)
{
mixdchlet_postq(dchl, data->c[i]); // d->postq[q] is now P(q | c_i, theta)
sum_c = esl_vec_DSum(data->c[i], dchl->K); // |c_i|
+ j = 0;
+ if (dchl->isDnaStrandSymmetric)
+ {
+ for (k = 0; k < dchl->Q - 1; k += 2) // paired components
+ dp[j++] -= dchl->postq[k] + dchl->postq[k+1] - 2 * dchl->q[k];
+
+ if (k == dchl->Q - 1) // unpaired self-symmetric component
+ dp[j++] -= dchl->postq[k] - dchl->q[k];
+
+ for (k = 0; k < dchl->Q - 1; k += 2) // paired components
+ {
+ double strand_weights[2] = {0, 0};
+ for (a = 0; a < dchl->K; a++)
+ {
+ int b = dchl->K - 1 - a; // complement of DNA base a
+ strand_weights[0] += lgamma(dchl->alpha[k][a] + data->c[i][a]);
+ strand_weights[1] += lgamma(dchl->alpha[k][a] + data->c[i][b]);
+ }
+ esl_vec_DLogNorm(strand_weights, 2);
+
+ sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K);
+ esl_stats_Psi( sum_alpha + sum_c, &psi2);
+ esl_stats_Psi( sum_alpha, &psi3);
+ for (a = 0; a < dchl->K; a++)
+ {
+ int b = dchl->K - 1 - a; // complement of DNA base a
+ esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1);
+ esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b);
+ esl_stats_Psi( dchl->alpha[k][a], &psi4);
+ dp[j++] -= dchl->alpha[k][a] * (dchl->postq[k] + dchl->postq[k+1])
+ * (strand_weights[0] * psi1 + strand_weights[1] * psi1b - psi2 + psi3 - psi4);
+ }
+ }
+
+ if (k == dchl->Q - 1) // unpaired self-symmetric component
+ {
+ sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K);
+ esl_stats_Psi( sum_alpha + sum_c, &psi2);
+ esl_stats_Psi( sum_alpha, &psi3);
+ for (a = 0; a < dchl->K / 2; a++)
+ {
+ int b = dchl->K - 1 - a; // complement of DNA base a
+ esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1);
+ esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b);
+ esl_stats_Psi( dchl->alpha[k][a], &psi4);
+ dp[j++] -= dchl->alpha[k][a] * dchl->postq[k]
+ * (psi1 + psi1b - 2 * psi2 + 2 * psi3 - 2 * psi4);
+ }
+ }
+
+ continue;
+ }
+
/* mixture coefficient gradient */
- j = 0;
for (k = 0; k < dchl->Q; k++)
dp[j++] -= dchl->postq[k] - dchl->q[k];
@@ -351,6 +451,8 @@ esl_mixdchlet_Fit(double **c, int N, ESL_MIXDCHLET *dchl, double *opt_nll)
double fx;
int status;
+ if (dchl->isDnaStrandSymmetric) nparam = (nparam + 1) / 2;
+
cfg = esl_min_cfg_Create(nparam);
if (! cfg) { status = eslEMEM; goto ERROR; }
cfg->cg_rtol = 3e-5;
@@ -419,6 +521,23 @@ esl_mixdchlet_Sample(ESL_RANDOMNESS *rng, ESL_MIXDCHLET *dchl)
{
int k,a;
+ if (dchl->isDnaStrandSymmetric)
+ {
+ esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q);
+ for (k = 0; k < dchl->Q - 1; k += 2) // paired components
+ dchl->q[k] = dchl->q[k+1] = (dchl->q[k] + dchl->q[k+1]) / 2;
+
+ for (k = 0; k < dchl->Q - 1; k += 2) // paired components
+ for (a = 0; a < dchl->K; a++)
+ dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng);
+
+ if (k == dchl->Q - 1) // unpaired self-symmetric component
+ for (a = 0; a < dchl->K / 2; a++)
+ dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng);
+
+ return eslOK;
+ }
+
esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q);
for (k = 0; k < dchl->Q; k++)
for (a = 0; a < dchl->K; a++)
diff --git a/esl_mixdchlet.h b/esl_mixdchlet.h
index 4640076..bd7a2be 100644
--- a/esl_mixdchlet.h
+++ b/esl_mixdchlet.h
@@ -20,6 +20,7 @@ typedef struct {
double **alpha; /* Dirichlet params alpha[0..Q-1][0..K-1] */
int Q; /* number of mixtures, e.g. 9 for Sjolander */
int K; /* alphabet size, e.g. 20 */
+ int isDnaStrandSymmetric; /* is it DNA-strand-symmetric for A C G T? */
double *postq; /* temp space 0..Q-1: for posterior P(k|c) for example */
/*::cexcerpt::dirichlet_mixdchlet::end::*/
@@ -27,6 +28,7 @@ typedef struct {
extern ESL_MIXDCHLET *esl_mixdchlet_Create(int Q, int K);
+extern ESL_MIXDCHLET *esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric);
extern void esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl);
extern double esl_mixdchlet_logp_c (ESL_MIXDCHLET *dchl, double *c);
diff --git a/esl_mixdchlet.tex b/esl_mixdchlet.tex
index 8c5ef28..38c0910 100644
--- a/esl_mixdchlet.tex
+++ b/esl_mixdchlet.tex
@@ -1,6 +1,7 @@
\documentclass[11pt]{article}
\setcounter{secnumdepth}{0}
+\usepackage{amsmath}
\usepackage{relsize}
\newcommand{\mono}[1]{{\smaller\texttt{#1}}} % literal (to be typed): code, program names
@@ -38,7 +39,7 @@ \section{Fitting a mixture Dirichlet to counts}
\[
P(c_i \mid \alpha_k) = \frac{ |c_i|! }
{ \prod_a c_{ia}! }
- \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ia} \right) }
+ \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) }
{ \Gamma ( |c_i + \alpha_k| ) }
\frac{ \Gamma ( |\alpha_k| ) }
{ \prod_a \Gamma \left( \alpha_{ka} \right) }
@@ -91,6 +92,55 @@ \section{Fitting a mixture Dirichlet to counts}
maximizer. The implementation provides the negative log likelihood
and the negative gradient to the CG routine.
+\subsection{Fitting a mixture Dirichlet with DNA strand symmetry}
-\end{document}
+Here, each count vector has $K=4$ counts of DNA bases. For a
+strand-symmetric mixture Dirichlet, each component must be either
+self-symmetric: $\alpha_{ka} = \alpha_{ka'}$ (where $a'$ is the
+complement of base $a$), or paired with a symmetric component:
+$\alpha_{ka} = \alpha_{k'a'}$ and $q_k = q_{k'}$ (where $k$ and $k'$
+indicate paired components). Define $R$ to be the number of pairs
+plus the number of unpaired components. It's convenient to make these
+definitions:
+\[
+r_k = \begin{cases}
+ 2 q_k & \text{for a paired component (i.e., the prior probability of the pair),} \\
+ q_k & \text{for an unpaired component}
+\end{cases}
+\]
+\[
+\hat{P}(k \mid \theta, c_i) = \begin{cases}
+ P(k \mid \theta, c_i) + P(k' \mid \theta, c_i) & \text{for a paired component,}\\
+ P(k \mid \theta, c_i) & \text{for an unpaired component}
+\end{cases}
+\]
+Here, the number of $\lambda$ parameters is $R$, and the number of
+$\beta$ parameters is: $K \times \text{(number of components) / 2}$.
+We can define $\beta_{ka}$ as above, and $\lambda_k$ like this:
+\begin{eqnarray*}
+ r_k & = & \frac{ e^{\lambda_k} } { \sum_{m=1}^R e^{\lambda_m} } \,.
+\end{eqnarray*}
+Then,
+\begin{eqnarray*}
+\frac{\partial L}{\partial \lambda_k} &=& \sum_i \bigl( \hat{P}(k \mid \theta, c_i) - r_k \bigr) \\
+\frac{\partial L}{\partial \beta_{ka}} &=&
+ \begin{cases}
+\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i)
+ (X_{ika} + Z_{ika}) & \text{for a paired component,} \\
+\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i)
+ (Y_{ika} + 2 Z_{ika}) & \text{for an unpaired component}
+ \end{cases}
+\end{eqnarray*}
+where
+\begin{eqnarray*}
+X_{ika} & = &
+\frac{ \Psi \left( c_{ia} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) +
+ \Psi \left( c_{ia'} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) }
+ { \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } \,,\\
+Y_{ika} & = & \Psi \left( c_{ia} + \alpha_{ka} \right) + \Psi \left( c_{ia'} + \alpha_{ka} \right) \,,\\
+Z_{ika} & = & - \Psi \left( | c_i | + | \alpha_k | \right)
+ + \Psi \left( | \alpha_k | \right)
+ - \Psi \left( \alpha_{ka} \right) \,.
+\end{eqnarray*}
+\end{document}
diff --git a/miniapps/Makefile.in b/miniapps/Makefile.in
index 4a8caab..18584a4 100644
--- a/miniapps/Makefile.in
+++ b/miniapps/Makefile.in
@@ -41,6 +41,8 @@ LDFLAGS = @LDFLAGS@
DEFS = @DEFS@
LIBS = -leasel @LIBGSL@ @LIBS@ @PTHREAD_LIBS@ -lm
+PROGS = esl-mixdchlet
+
## list of the miniapp commands to compile.
#
SUBCMDOBJS = \
@@ -97,6 +99,9 @@ check: ${PROGS} easel
easel: % : %.c ../libeasel.a ${SUBCMDOBJS}
${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${SUBCMDOBJS} ${LIBS}
+${PROGS}: % : %.c ../libeasel.a
+ ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${LIBS}
+
${SUBCMDOBJS}: %.o : %.c ../libeasel.a
${QUIET_CC}${CC} -I. -I.. -I${srcdir} -I${srcdir}/.. ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${SIMD_CFLAGS} ${DEFS} -c $<
diff --git a/miniapps/esl-mixdchlet.c b/miniapps/esl-mixdchlet.c
index fa38f8f..23150c4 100644
--- a/miniapps/esl-mixdchlet.c
+++ b/miniapps/esl-mixdchlet.c
@@ -111,6 +111,7 @@ static ESL_OPTIONS fit_options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to