diff --git a/esl_mixdchlet.c b/esl_mixdchlet.c index d7fb111..37b16a8 100644 --- a/esl_mixdchlet.c +++ b/esl_mixdchlet.c @@ -46,6 +46,12 @@ */ ESL_MIXDCHLET * esl_mixdchlet_Create(int Q, int K) +{ + return esl_mixdchlet_CreateForFitting(Q, K, 0); +} + +ESL_MIXDCHLET * +esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric) { ESL_MIXDCHLET *dchl = NULL; int status; @@ -64,6 +70,7 @@ esl_mixdchlet_Create(int Q, int K) dchl->Q = Q; dchl->K = K; + dchl->isDnaStrandSymmetric = isDnaStrandSymmetric; return dchl; ERROR: @@ -218,6 +225,25 @@ mixdchlet_pack_paramvector(ESL_MIXDCHLET *dchl, double *p) int j = 0; /* counter in packed parameter vector

*/ int k,a; /* indices over components, residues */ + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + p[j++] = log(dchl->q[k] * 2); // Prob(component) * 2 = Prob(pair) + + if (k == dchl->Q - 1) // unpaired self-symmetric component + p[j++] = log(dchl->q[k]); + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + p[j++] = log(dchl->alpha[k][a]); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + p[j++] = log(dchl->alpha[k][a]); + + return; + } + /* mixture coefficients */ for (k = 0; k < dchl->Q; k++) p[j++] = log(dchl->q[k]); @@ -240,6 +266,27 @@ mixdchlet_unpack_paramvector(double *p, ESL_MIXDCHLET *dchl) int j = 0; /* counter in packed parameter vector

*/ int k,a; /* indices over components, residues */ + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + dchl->q[k] = dchl->q[k+1] = exp(p[j++]) / 2; + + if (k == dchl->Q - 1) // unpaired self-symmetric component + dchl->q[k] = exp(p[j++]); + + esl_vec_DNorm(dchl->q, dchl->Q); + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = exp(p[j++]); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = exp(p[j++]); + + return; + } + /* mixture coefficients */ for (k = 0; k < dchl->Q; k++) dchl->q[k] = exp(p[j++]); @@ -278,6 +325,7 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) double sum_alpha; // |alpha_k| double sum_c; // |c_i| double psi1; // \Psi(c_ia + alpha_ka) + double psi1b; // \Psi(c_ia' + alpha_ka) double psi2; // \Psi( |c_i| + |alpha_k| ) double psi3; // \Psi( |alpha_k| ) double psi4; // \Psi( alpha_ka ) @@ -289,9 +337,61 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) { mixdchlet_postq(dchl, data->c[i]); // d->postq[q] is now P(q | c_i, theta) sum_c = esl_vec_DSum(data->c[i], dchl->K); // |c_i| + j = 0; + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + dp[j++] -= dchl->postq[k] + dchl->postq[k+1] - 2 * dchl->q[k]; + + if (k == dchl->Q - 1) // unpaired self-symmetric component + dp[j++] -= dchl->postq[k] - dchl->q[k]; + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + { + double strand_weights[2] = {0, 0}; + for (a = 0; a < dchl->K; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + strand_weights[0] += lgamma(dchl->alpha[k][a] + data->c[i][a]); + strand_weights[1] += lgamma(dchl->alpha[k][a] + data->c[i][b]); + } + esl_vec_DLogNorm(strand_weights, 2); + + sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K); + esl_stats_Psi( sum_alpha + sum_c, &psi2); + esl_stats_Psi( sum_alpha, &psi3); + for (a = 0; a < dchl->K; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1); + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b); + esl_stats_Psi( dchl->alpha[k][a], &psi4); + dp[j++] -= dchl->alpha[k][a] * (dchl->postq[k] + dchl->postq[k+1]) + * (strand_weights[0] * psi1 + strand_weights[1] * psi1b - psi2 + psi3 - psi4); + } + } + + if (k == dchl->Q - 1) // unpaired self-symmetric component + { + sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K); + esl_stats_Psi( sum_alpha + sum_c, &psi2); + esl_stats_Psi( sum_alpha, &psi3); + for (a = 0; a < dchl->K / 2; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1); + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b); + esl_stats_Psi( dchl->alpha[k][a], &psi4); + dp[j++] -= dchl->alpha[k][a] * dchl->postq[k] + * (psi1 + psi1b - 2 * psi2 + 2 * psi3 - 2 * psi4); + } + } + + continue; + } + /* mixture coefficient gradient */ - j = 0; for (k = 0; k < dchl->Q; k++) dp[j++] -= dchl->postq[k] - dchl->q[k]; @@ -351,6 +451,8 @@ esl_mixdchlet_Fit(double **c, int N, ESL_MIXDCHLET *dchl, double *opt_nll) double fx; int status; + if (dchl->isDnaStrandSymmetric) nparam = (nparam + 1) / 2; + cfg = esl_min_cfg_Create(nparam); if (! cfg) { status = eslEMEM; goto ERROR; } cfg->cg_rtol = 3e-5; @@ -419,6 +521,23 @@ esl_mixdchlet_Sample(ESL_RANDOMNESS *rng, ESL_MIXDCHLET *dchl) { int k,a; + if (dchl->isDnaStrandSymmetric) + { + esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q); + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + dchl->q[k] = dchl->q[k+1] = (dchl->q[k] + dchl->q[k+1]) / 2; + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng); + + return eslOK; + } + esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q); for (k = 0; k < dchl->Q; k++) for (a = 0; a < dchl->K; a++) diff --git a/esl_mixdchlet.h b/esl_mixdchlet.h index 4640076..bd7a2be 100644 --- a/esl_mixdchlet.h +++ b/esl_mixdchlet.h @@ -20,6 +20,7 @@ typedef struct { double **alpha; /* Dirichlet params alpha[0..Q-1][0..K-1] */ int Q; /* number of mixtures, e.g. 9 for Sjolander */ int K; /* alphabet size, e.g. 20 */ + int isDnaStrandSymmetric; /* is it DNA-strand-symmetric for A C G T? */ double *postq; /* temp space 0..Q-1: for posterior P(k|c) for example */ /*::cexcerpt::dirichlet_mixdchlet::end::*/ @@ -27,6 +28,7 @@ typedef struct { extern ESL_MIXDCHLET *esl_mixdchlet_Create(int Q, int K); +extern ESL_MIXDCHLET *esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric); extern void esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl); extern double esl_mixdchlet_logp_c (ESL_MIXDCHLET *dchl, double *c); diff --git a/esl_mixdchlet.tex b/esl_mixdchlet.tex index 8c5ef28..38c0910 100644 --- a/esl_mixdchlet.tex +++ b/esl_mixdchlet.tex @@ -1,6 +1,7 @@ \documentclass[11pt]{article} \setcounter{secnumdepth}{0} +\usepackage{amsmath} \usepackage{relsize} \newcommand{\mono}[1]{{\smaller\texttt{#1}}} % literal (to be typed): code, program names @@ -38,7 +39,7 @@ \section{Fitting a mixture Dirichlet to counts} \[ P(c_i \mid \alpha_k) = \frac{ |c_i|! } { \prod_a c_{ia}! } - \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ia} \right) } + \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) } { \Gamma ( |c_i + \alpha_k| ) } \frac{ \Gamma ( |\alpha_k| ) } { \prod_a \Gamma \left( \alpha_{ka} \right) } @@ -91,6 +92,55 @@ \section{Fitting a mixture Dirichlet to counts} maximizer. The implementation provides the negative log likelihood and the negative gradient to the CG routine. +\subsection{Fitting a mixture Dirichlet with DNA strand symmetry} -\end{document} +Here, each count vector has $K=4$ counts of DNA bases. For a +strand-symmetric mixture Dirichlet, each component must be either +self-symmetric: $\alpha_{ka} = \alpha_{ka'}$ (where $a'$ is the +complement of base $a$), or paired with a symmetric component: +$\alpha_{ka} = \alpha_{k'a'}$ and $q_k = q_{k'}$ (where $k$ and $k'$ +indicate paired components). Define $R$ to be the number of pairs +plus the number of unpaired components. It's convenient to make these +definitions: +\[ +r_k = \begin{cases} + 2 q_k & \text{for a paired component (i.e., the prior probability of the pair),} \\ + q_k & \text{for an unpaired component} +\end{cases} +\] +\[ +\hat{P}(k \mid \theta, c_i) = \begin{cases} + P(k \mid \theta, c_i) + P(k' \mid \theta, c_i) & \text{for a paired component,}\\ + P(k \mid \theta, c_i) & \text{for an unpaired component} +\end{cases} +\] +Here, the number of $\lambda$ parameters is $R$, and the number of +$\beta$ parameters is: $K \times \text{(number of components) / 2}$. +We can define $\beta_{ka}$ as above, and $\lambda_k$ like this: +\begin{eqnarray*} + r_k & = & \frac{ e^{\lambda_k} } { \sum_{m=1}^R e^{\lambda_m} } \,. +\end{eqnarray*} +Then, +\begin{eqnarray*} +\frac{\partial L}{\partial \lambda_k} &=& \sum_i \bigl( \hat{P}(k \mid \theta, c_i) - r_k \bigr) \\ +\frac{\partial L}{\partial \beta_{ka}} &=& + \begin{cases} +\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i) + (X_{ika} + Z_{ika}) & \text{for a paired component,} \\ +\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i) + (Y_{ika} + 2 Z_{ika}) & \text{for an unpaired component} + \end{cases} +\end{eqnarray*} +where +\begin{eqnarray*} +X_{ika} & = & +\frac{ \Psi \left( c_{ia} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + + \Psi \left( c_{ia'} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } + { \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } \,,\\ +Y_{ika} & = & \Psi \left( c_{ia} + \alpha_{ka} \right) + \Psi \left( c_{ia'} + \alpha_{ka} \right) \,,\\ +Z_{ika} & = & - \Psi \left( | c_i | + | \alpha_k | \right) + + \Psi \left( | \alpha_k | \right) + - \Psi \left( \alpha_{ka} \right) \,. +\end{eqnarray*} +\end{document} diff --git a/miniapps/Makefile.in b/miniapps/Makefile.in index 4a8caab..18584a4 100644 --- a/miniapps/Makefile.in +++ b/miniapps/Makefile.in @@ -41,6 +41,8 @@ LDFLAGS = @LDFLAGS@ DEFS = @DEFS@ LIBS = -leasel @LIBGSL@ @LIBS@ @PTHREAD_LIBS@ -lm +PROGS = esl-mixdchlet + ## list of the miniapp commands to compile. # SUBCMDOBJS = \ @@ -97,6 +99,9 @@ check: ${PROGS} easel easel: % : %.c ../libeasel.a ${SUBCMDOBJS} ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${SUBCMDOBJS} ${LIBS} +${PROGS}: % : %.c ../libeasel.a + ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${LIBS} + ${SUBCMDOBJS}: %.o : %.c ../libeasel.a ${QUIET_CC}${CC} -I. -I.. -I${srcdir} -I${srcdir}/.. ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${SIMD_CFLAGS} ${DEFS} -c $< diff --git a/miniapps/esl-mixdchlet.c b/miniapps/esl-mixdchlet.c index fa38f8f..23150c4 100644 --- a/miniapps/esl-mixdchlet.c +++ b/miniapps/esl-mixdchlet.c @@ -111,6 +111,7 @@ static ESL_OPTIONS fit_options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, + { "-d", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "fit a DNA strand-symmetric mixture to A C G T counts", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; @@ -119,13 +120,14 @@ cmd_fit(const char *topcmd, const ESL_SUBCMD *sub, int argc, char **argv) { ESL_GETOPTS *go = esl_subcmd_CreateDefaultApp(topcmd, sub, fit_options, argc, argv, /*custom opthelp?:*/NULL); ESL_RANDOMNESS *rng = esl_randomness_Create( esl_opt_GetInteger(go, "-s")); + int isDna = esl_opt_GetBoolean(go, "-d"); // a complement-symmetric mixture Dirichlet for DNA base counts? int Q = strtol(esl_opt_GetArg(go, 1), NULL, 10); // number of mixture components int K = strtol(esl_opt_GetArg(go, 2), NULL, 10); // size of probability/parameter vectors - length of count vectors char *ctfile = esl_opt_GetArg(go, 3); // count file to input char *outfile = esl_opt_GetArg(go, 4); // mixture Dirichlet file output ESL_FILEPARSER *efp = NULL; // open fileparser for reading FILE *ofp = NULL; // open output file for writing - ESL_MIXDCHLET *dchl = esl_mixdchlet_Create(Q,K); // mixture Dirichlet being estimated + ESL_MIXDCHLET *dchl = esl_mixdchlet_CreateForFitting(Q,K,isDna); // mixture Dirichlet being estimated int Nalloc = 1024; // initial allocation for ct[] double **ct = esl_mat_DCreate(Nalloc, K); // count vectors, [0..N-1][0..K-1] int N = 0; // number of count vectors so far