From c464606b11ed8b31e6533de72add4ed0ebe7c8a0 Mon Sep 17 00:00:00 2001 From: "Martin C. Frith" Date: Mon, 13 Oct 2025 01:33:33 +0000 Subject: [PATCH 1/3] Adds esl-mixdchlet DNA strand symmetry option --- esl_mixdchlet.c | 73 ++++++++++++++++++++++++++++++++++------ esl_mixdchlet.h | 2 ++ esl_mixdchlet.tex | 47 +++++++++++++++++++++++++- miniapps/Makefile.in | 5 +++ miniapps/esl-mixdchlet.c | 5 ++- 5 files changed, 119 insertions(+), 13 deletions(-) diff --git a/esl_mixdchlet.c b/esl_mixdchlet.c index d7fb1117..638f6dee 100644 --- a/esl_mixdchlet.c +++ b/esl_mixdchlet.c @@ -45,11 +45,12 @@ * Throws: NULL on allocation failure. */ ESL_MIXDCHLET * -esl_mixdchlet_Create(int Q, int K) +esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric) { ESL_MIXDCHLET *dchl = NULL; int status; + if (isDnaStrandSymmetric) Q /= 2; // Q components => Q/2 pairs of components ESL_DASSERT1( (Q > 0) ); ESL_DASSERT1( (K > 0) ); @@ -64,14 +65,18 @@ esl_mixdchlet_Create(int Q, int K) dchl->Q = Q; dchl->K = K; + dchl->isDnaStrandSymmetric = isDnaStrandSymmetric; return dchl; ERROR: esl_mixdchlet_Destroy(dchl); return NULL; } - - +ESL_MIXDCHLET * +esl_mixdchlet_Create(int Q, int K) +{ + return esl_mixdchlet_CreateForFitting(Q, K, 0); +} /* Function: esl_mixdchlet_Destroy() * Synopsis: Free a mixture Dirichlet. @@ -94,6 +99,28 @@ esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl) * 2. Likelihoods, posteriors, inference *****************************************************************/ +// Return the log probability of a vector c of counts of A/C/G/T, given a +// DNA-strand-symmetric pair of Dirichlet distributions: log P(c | alpha) +static double +mixdchlet_logprob_c_dna(double *c, double *alpha, int K) // presumably, K=4 +{ + double lgamma_sums[2] = {0, 0}; + double sum1, sum2, sum3, logp; + sum1 = sum2 = sum3 = logp = 0; + + for (int a = 0; a < K; a++) // loop over 4 DNA bases + { + sum1 += c[a] + alpha[a]; + sum2 += alpha[a]; + sum3 += c[a]; + lgamma_sums[0] += lgamma(alpha[a] + c[a]); + lgamma_sums[1] += lgamma(alpha[a] + c[K - 1 - a]); // complement of a + logp -= lgamma(c[a] + 1) + lgamma(alpha[a]); + } + + return logp + esl_vec_DLogSum(lgamma_sums, 2) - eslCONST_LOG2 + + lgamma(sum2) + lgamma(sum3 + 1) - lgamma(sum1); +} /* mixchlet_postq() * Calculate P(q | c), the posterior probability of component q. @@ -103,7 +130,9 @@ mixdchlet_postq(ESL_MIXDCHLET *dchl, double *c) { int k; for (k = 0; k < dchl->Q; k++) - if (dchl->q[k] > 0.) dchl->postq[k] = log(dchl->q[k]) + esl_dirichlet_logpdf_c(c, dchl->alpha[k], dchl->K); + if (dchl->q[k] > 0.) + dchl->postq[k] = log(dchl->q[k]) + + (dchl->isDnaStrandSymmetric ? mixdchlet_logprob_c_dna : esl_dirichlet_logpdf_c)(c, dchl->alpha[k], dchl->K); else dchl->postq[k] = -eslINFINITY; esl_vec_DLogNorm(dchl->postq, dchl->Q); } @@ -128,7 +157,9 @@ esl_mixdchlet_logp_c(ESL_MIXDCHLET *dchl, double *c) { int k; for (k = 0; k < dchl->Q; k++) - if (dchl->q[k] > 0.) dchl->postq[k] = log(dchl->q[k]) + esl_dirichlet_logpdf_c(c, dchl->alpha[k], dchl->K); + if (dchl->q[k] > 0.) + dchl->postq[k] = log(dchl->q[k]) + + (dchl->isDnaStrandSymmetric ? mixdchlet_logprob_c_dna : esl_dirichlet_logpdf_c)(c, dchl->alpha[k], dchl->K); else dchl->postq[k] = -eslINFINITY; return esl_vec_DLogSum(dchl->postq, dchl->Q); } @@ -278,6 +309,7 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) double sum_alpha; // |alpha_k| double sum_c; // |c_i| double psi1; // \Psi(c_ia + alpha_ka) + double psi1b; // \Psi(c_ia' + alpha_ka) double psi2; // \Psi( |c_i| + |alpha_k| ) double psi3; // \Psi( |alpha_k| ) double psi4; // \Psi( alpha_ka ) @@ -297,12 +329,27 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) for (k = 0; k < dchl->Q; k++) { + double strand_weights[2] = {0, 0}; + for (a = 0; a < dchl->K; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + strand_weights[0] += lgamma(dchl->alpha[k][a] + data->c[i][a]); + strand_weights[1] += lgamma(dchl->alpha[k][a] + data->c[i][b]); + } + esl_vec_DLogNorm(strand_weights, 2); + sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K); esl_stats_Psi( sum_alpha + sum_c, &psi2); esl_stats_Psi( sum_alpha, &psi3); for (a = 0; a < dchl->K; a++) { esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1); + if (dchl->isDnaStrandSymmetric) + { + int b = dchl->K - 1 - a; // complement of DNA base a + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b); + psi1 = strand_weights[0] * psi1 + strand_weights[1] * psi1b; + } esl_stats_Psi( dchl->alpha[k][a], &psi4); dp[j++] -= dchl->alpha[k][a] * dchl->postq[k] * (psi1 - psi2 + psi3 - psi4); } @@ -518,16 +565,20 @@ esl_mixdchlet_Read(ESL_FILEPARSER *efp, ESL_MIXDCHLET **ret_dchl) int esl_mixdchlet_Write(FILE *fp, const ESL_MIXDCHLET *dchl) { - int k,a; + int k,s,a; int status; + int n = 1 + dchl->isDnaStrandSymmetric; - if ((status = esl_fprintf(fp, "%d %d\n", dchl->K, dchl->Q)) != eslOK) return status; + if ((status = esl_fprintf(fp, "%d %d\n", dchl->K, dchl->Q * n)) != eslOK) return status; for (k = 0; k < dchl->Q; k++) { - if ((status = esl_fprintf(fp, "%.4f ", dchl->q[k])) != eslOK) return status; - for (a = 0; a < dchl->K; a++) - if ((status = esl_fprintf(fp, "%.4f ", dchl->alpha[k][a])) != eslOK) return status; - if ((status = esl_fprintf(fp, "\n")) != eslOK) return status; + for (s = 0; s < n; ++s) + { + if ((status = esl_fprintf(fp, "%.4f ", dchl->q[k] / n)) != eslOK) return status; + for (a = 0; a < dchl->K; a++) + if ((status = esl_fprintf(fp, "%.4f ", dchl->alpha[k][s ? dchl->K - 1 - a : a])) != eslOK) return status; + if ((status = esl_fprintf(fp, "\n")) != eslOK) return status; + } } return eslOK; } diff --git a/esl_mixdchlet.h b/esl_mixdchlet.h index 46400767..bd7a2bea 100644 --- a/esl_mixdchlet.h +++ b/esl_mixdchlet.h @@ -20,6 +20,7 @@ typedef struct { double **alpha; /* Dirichlet params alpha[0..Q-1][0..K-1] */ int Q; /* number of mixtures, e.g. 9 for Sjolander */ int K; /* alphabet size, e.g. 20 */ + int isDnaStrandSymmetric; /* is it DNA-strand-symmetric for A C G T? */ double *postq; /* temp space 0..Q-1: for posterior P(k|c) for example */ /*::cexcerpt::dirichlet_mixdchlet::end::*/ @@ -27,6 +28,7 @@ typedef struct { extern ESL_MIXDCHLET *esl_mixdchlet_Create(int Q, int K); +extern ESL_MIXDCHLET *esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric); extern void esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl); extern double esl_mixdchlet_logp_c (ESL_MIXDCHLET *dchl, double *c); diff --git a/esl_mixdchlet.tex b/esl_mixdchlet.tex index 8c5ef284..93b69544 100644 --- a/esl_mixdchlet.tex +++ b/esl_mixdchlet.tex @@ -91,6 +91,51 @@ \section{Fitting a mixture Dirichlet to counts} maximizer. The implementation provides the negative log likelihood and the negative gradient to the CG routine. +\subsection{Fitting a mixture Dirichlet with DNA strand symmetry} -\end{document} +Here, each count vector has $K=4$ counts of DNA bases. The mixture +Dirichlet consists of $Q$ \emph{pairs} of components. Each component +in a pair is identical to the other after swapping DNA bases with +their complements. So, the two components have identical mixture +coefficients: $q_k/2$. (It's also possible to have unpaired +self-symmetric components: that isn't considered here.) + +Here, we define $P(c_i \mid \alpha_k)$ to be the probability of one +count vector given one pair of Dirichlet components: + +\[ +P(c_i \mid \alpha_k) = \frac{ |c_i|! } + { \prod_a c_{ia}! } + \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ia} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ia} \right) } + { 2 \enspace \Gamma ( |c_i + \alpha_k| ) } + \frac{ \Gamma ( |\alpha_k| ) } + { \prod_a \Gamma \left( \alpha_{ka} \right) }\,, +\] +where $a'$ means the complement of base $a$. +Then, the equation for $\frac{\partial L}{\partial \lambda_k}$ is the +same as above, except we're using the new definition of $P(c_i \mid +\alpha_k)$: +\[ +P(k \mid \theta, c_i) = q_k P(c_i \mid \alpha_k) \Big/ \sum_k q_k P(c_i \mid \alpha_k)\,. +\] +The partial derivative with respect to $\beta_{ka}$ is +\[ +\frac{\partial L}{\partial \beta_{ka}} = \sum_i + \alpha_{ka} P(k \mid \theta, c_i) + \left( X_{ika} + - \Psi \left( | c_i | + | \alpha_k | \right) + + \Psi \left( | \alpha_k | \right) + - \Psi \left( \alpha_{ka} \right) + \right)\,, +\] +where +\[ +X_{ika} = +\frac{ \Psi \left( c_{ia} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + + \Psi \left( c_{ia'} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } + { \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) }\,. +\] + + +\end{document} diff --git a/miniapps/Makefile.in b/miniapps/Makefile.in index 4a8caab4..18584a42 100644 --- a/miniapps/Makefile.in +++ b/miniapps/Makefile.in @@ -41,6 +41,8 @@ LDFLAGS = @LDFLAGS@ DEFS = @DEFS@ LIBS = -leasel @LIBGSL@ @LIBS@ @PTHREAD_LIBS@ -lm +PROGS = esl-mixdchlet + ## list of the miniapp commands to compile. # SUBCMDOBJS = \ @@ -97,6 +99,9 @@ check: ${PROGS} easel easel: % : %.c ../libeasel.a ${SUBCMDOBJS} ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${SUBCMDOBJS} ${LIBS} +${PROGS}: % : %.c ../libeasel.a + ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${LIBS} + ${SUBCMDOBJS}: %.o : %.c ../libeasel.a ${QUIET_CC}${CC} -I. -I.. -I${srcdir} -I${srcdir}/.. ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${SIMD_CFLAGS} ${DEFS} -c $< diff --git a/miniapps/esl-mixdchlet.c b/miniapps/esl-mixdchlet.c index fa38f8fb..df72f389 100644 --- a/miniapps/esl-mixdchlet.c +++ b/miniapps/esl-mixdchlet.c @@ -111,6 +111,7 @@ static ESL_OPTIONS fit_options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, + { "-d", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "fit a DNA strand-symmetric mixture to A C G T counts (Q must be even)", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; @@ -119,13 +120,15 @@ cmd_fit(const char *topcmd, const ESL_SUBCMD *sub, int argc, char **argv) { ESL_GETOPTS *go = esl_subcmd_CreateDefaultApp(topcmd, sub, fit_options, argc, argv, /*custom opthelp?:*/NULL); ESL_RANDOMNESS *rng = esl_randomness_Create( esl_opt_GetInteger(go, "-s")); + int isDna = esl_opt_GetBoolean(go, "-d"); // a complement-symmetric mixture Dirichlet for DNA base counts? int Q = strtol(esl_opt_GetArg(go, 1), NULL, 10); // number of mixture components int K = strtol(esl_opt_GetArg(go, 2), NULL, 10); // size of probability/parameter vectors - length of count vectors char *ctfile = esl_opt_GetArg(go, 3); // count file to input char *outfile = esl_opt_GetArg(go, 4); // mixture Dirichlet file output ESL_FILEPARSER *efp = NULL; // open fileparser for reading + if (isDna && Q % 2) esl_fatal("option -d requires number of mixture components Q to be even"); FILE *ofp = NULL; // open output file for writing - ESL_MIXDCHLET *dchl = esl_mixdchlet_Create(Q,K); // mixture Dirichlet being estimated + ESL_MIXDCHLET *dchl = esl_mixdchlet_CreateForFitting(Q,K,isDna); // mixture Dirichlet being estimated int Nalloc = 1024; // initial allocation for ct[] double **ct = esl_mat_DCreate(Nalloc, K); // count vectors, [0..N-1][0..K-1] int N = 0; // number of count vectors so far From 1fdc27d094cb8fa9f5bd01a3c33c3866cb042e0b Mon Sep 17 00:00:00 2001 From: "Martin C. Frith" Date: Thu, 23 Oct 2025 08:56:08 +0000 Subject: [PATCH 2/3] Revert "Adds esl-mixdchlet DNA strand symmetry option" This reverts commit c464606b11ed8b31e6533de72add4ed0ebe7c8a0. --- esl_mixdchlet.c | 73 ++++++---------------------------------- esl_mixdchlet.h | 2 -- esl_mixdchlet.tex | 47 +------------------------- miniapps/Makefile.in | 5 --- miniapps/esl-mixdchlet.c | 5 +-- 5 files changed, 13 insertions(+), 119 deletions(-) diff --git a/esl_mixdchlet.c b/esl_mixdchlet.c index 638f6dee..d7fb1117 100644 --- a/esl_mixdchlet.c +++ b/esl_mixdchlet.c @@ -45,12 +45,11 @@ * Throws: NULL on allocation failure. */ ESL_MIXDCHLET * -esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric) +esl_mixdchlet_Create(int Q, int K) { ESL_MIXDCHLET *dchl = NULL; int status; - if (isDnaStrandSymmetric) Q /= 2; // Q components => Q/2 pairs of components ESL_DASSERT1( (Q > 0) ); ESL_DASSERT1( (K > 0) ); @@ -65,18 +64,14 @@ esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric) dchl->Q = Q; dchl->K = K; - dchl->isDnaStrandSymmetric = isDnaStrandSymmetric; return dchl; ERROR: esl_mixdchlet_Destroy(dchl); return NULL; } -ESL_MIXDCHLET * -esl_mixdchlet_Create(int Q, int K) -{ - return esl_mixdchlet_CreateForFitting(Q, K, 0); -} + + /* Function: esl_mixdchlet_Destroy() * Synopsis: Free a mixture Dirichlet. @@ -99,28 +94,6 @@ esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl) * 2. Likelihoods, posteriors, inference *****************************************************************/ -// Return the log probability of a vector c of counts of A/C/G/T, given a -// DNA-strand-symmetric pair of Dirichlet distributions: log P(c | alpha) -static double -mixdchlet_logprob_c_dna(double *c, double *alpha, int K) // presumably, K=4 -{ - double lgamma_sums[2] = {0, 0}; - double sum1, sum2, sum3, logp; - sum1 = sum2 = sum3 = logp = 0; - - for (int a = 0; a < K; a++) // loop over 4 DNA bases - { - sum1 += c[a] + alpha[a]; - sum2 += alpha[a]; - sum3 += c[a]; - lgamma_sums[0] += lgamma(alpha[a] + c[a]); - lgamma_sums[1] += lgamma(alpha[a] + c[K - 1 - a]); // complement of a - logp -= lgamma(c[a] + 1) + lgamma(alpha[a]); - } - - return logp + esl_vec_DLogSum(lgamma_sums, 2) - eslCONST_LOG2 - + lgamma(sum2) + lgamma(sum3 + 1) - lgamma(sum1); -} /* mixchlet_postq() * Calculate P(q | c), the posterior probability of component q. @@ -130,9 +103,7 @@ mixdchlet_postq(ESL_MIXDCHLET *dchl, double *c) { int k; for (k = 0; k < dchl->Q; k++) - if (dchl->q[k] > 0.) - dchl->postq[k] = log(dchl->q[k]) + - (dchl->isDnaStrandSymmetric ? mixdchlet_logprob_c_dna : esl_dirichlet_logpdf_c)(c, dchl->alpha[k], dchl->K); + if (dchl->q[k] > 0.) dchl->postq[k] = log(dchl->q[k]) + esl_dirichlet_logpdf_c(c, dchl->alpha[k], dchl->K); else dchl->postq[k] = -eslINFINITY; esl_vec_DLogNorm(dchl->postq, dchl->Q); } @@ -157,9 +128,7 @@ esl_mixdchlet_logp_c(ESL_MIXDCHLET *dchl, double *c) { int k; for (k = 0; k < dchl->Q; k++) - if (dchl->q[k] > 0.) - dchl->postq[k] = log(dchl->q[k]) + - (dchl->isDnaStrandSymmetric ? mixdchlet_logprob_c_dna : esl_dirichlet_logpdf_c)(c, dchl->alpha[k], dchl->K); + if (dchl->q[k] > 0.) dchl->postq[k] = log(dchl->q[k]) + esl_dirichlet_logpdf_c(c, dchl->alpha[k], dchl->K); else dchl->postq[k] = -eslINFINITY; return esl_vec_DLogSum(dchl->postq, dchl->Q); } @@ -309,7 +278,6 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) double sum_alpha; // |alpha_k| double sum_c; // |c_i| double psi1; // \Psi(c_ia + alpha_ka) - double psi1b; // \Psi(c_ia' + alpha_ka) double psi2; // \Psi( |c_i| + |alpha_k| ) double psi3; // \Psi( |alpha_k| ) double psi4; // \Psi( alpha_ka ) @@ -329,27 +297,12 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) for (k = 0; k < dchl->Q; k++) { - double strand_weights[2] = {0, 0}; - for (a = 0; a < dchl->K; a++) - { - int b = dchl->K - 1 - a; // complement of DNA base a - strand_weights[0] += lgamma(dchl->alpha[k][a] + data->c[i][a]); - strand_weights[1] += lgamma(dchl->alpha[k][a] + data->c[i][b]); - } - esl_vec_DLogNorm(strand_weights, 2); - sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K); esl_stats_Psi( sum_alpha + sum_c, &psi2); esl_stats_Psi( sum_alpha, &psi3); for (a = 0; a < dchl->K; a++) { esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1); - if (dchl->isDnaStrandSymmetric) - { - int b = dchl->K - 1 - a; // complement of DNA base a - esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b); - psi1 = strand_weights[0] * psi1 + strand_weights[1] * psi1b; - } esl_stats_Psi( dchl->alpha[k][a], &psi4); dp[j++] -= dchl->alpha[k][a] * dchl->postq[k] * (psi1 - psi2 + psi3 - psi4); } @@ -565,20 +518,16 @@ esl_mixdchlet_Read(ESL_FILEPARSER *efp, ESL_MIXDCHLET **ret_dchl) int esl_mixdchlet_Write(FILE *fp, const ESL_MIXDCHLET *dchl) { - int k,s,a; + int k,a; int status; - int n = 1 + dchl->isDnaStrandSymmetric; - if ((status = esl_fprintf(fp, "%d %d\n", dchl->K, dchl->Q * n)) != eslOK) return status; + if ((status = esl_fprintf(fp, "%d %d\n", dchl->K, dchl->Q)) != eslOK) return status; for (k = 0; k < dchl->Q; k++) { - for (s = 0; s < n; ++s) - { - if ((status = esl_fprintf(fp, "%.4f ", dchl->q[k] / n)) != eslOK) return status; - for (a = 0; a < dchl->K; a++) - if ((status = esl_fprintf(fp, "%.4f ", dchl->alpha[k][s ? dchl->K - 1 - a : a])) != eslOK) return status; - if ((status = esl_fprintf(fp, "\n")) != eslOK) return status; - } + if ((status = esl_fprintf(fp, "%.4f ", dchl->q[k])) != eslOK) return status; + for (a = 0; a < dchl->K; a++) + if ((status = esl_fprintf(fp, "%.4f ", dchl->alpha[k][a])) != eslOK) return status; + if ((status = esl_fprintf(fp, "\n")) != eslOK) return status; } return eslOK; } diff --git a/esl_mixdchlet.h b/esl_mixdchlet.h index bd7a2bea..46400767 100644 --- a/esl_mixdchlet.h +++ b/esl_mixdchlet.h @@ -20,7 +20,6 @@ typedef struct { double **alpha; /* Dirichlet params alpha[0..Q-1][0..K-1] */ int Q; /* number of mixtures, e.g. 9 for Sjolander */ int K; /* alphabet size, e.g. 20 */ - int isDnaStrandSymmetric; /* is it DNA-strand-symmetric for A C G T? */ double *postq; /* temp space 0..Q-1: for posterior P(k|c) for example */ /*::cexcerpt::dirichlet_mixdchlet::end::*/ @@ -28,7 +27,6 @@ typedef struct { extern ESL_MIXDCHLET *esl_mixdchlet_Create(int Q, int K); -extern ESL_MIXDCHLET *esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric); extern void esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl); extern double esl_mixdchlet_logp_c (ESL_MIXDCHLET *dchl, double *c); diff --git a/esl_mixdchlet.tex b/esl_mixdchlet.tex index 93b69544..8c5ef284 100644 --- a/esl_mixdchlet.tex +++ b/esl_mixdchlet.tex @@ -91,51 +91,6 @@ \section{Fitting a mixture Dirichlet to counts} maximizer. The implementation provides the negative log likelihood and the negative gradient to the CG routine. -\subsection{Fitting a mixture Dirichlet with DNA strand symmetry} - -Here, each count vector has $K=4$ counts of DNA bases. The mixture -Dirichlet consists of $Q$ \emph{pairs} of components. Each component -in a pair is identical to the other after swapping DNA bases with -their complements. So, the two components have identical mixture -coefficients: $q_k/2$. (It's also possible to have unpaired -self-symmetric components: that isn't considered here.) - -Here, we define $P(c_i \mid \alpha_k)$ to be the probability of one -count vector given one pair of Dirichlet components: - -\[ -P(c_i \mid \alpha_k) = \frac{ |c_i|! } - { \prod_a c_{ia}! } - \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ia} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ia} \right) } - { 2 \enspace \Gamma ( |c_i + \alpha_k| ) } - \frac{ \Gamma ( |\alpha_k| ) } - { \prod_a \Gamma \left( \alpha_{ka} \right) }\,, -\] -where $a'$ means the complement of base $a$. - -Then, the equation for $\frac{\partial L}{\partial \lambda_k}$ is the -same as above, except we're using the new definition of $P(c_i \mid -\alpha_k)$: -\[ -P(k \mid \theta, c_i) = q_k P(c_i \mid \alpha_k) \Big/ \sum_k q_k P(c_i \mid \alpha_k)\,. -\] -The partial derivative with respect to $\beta_{ka}$ is -\[ -\frac{\partial L}{\partial \beta_{ka}} = \sum_i - \alpha_{ka} P(k \mid \theta, c_i) - \left( X_{ika} - - \Psi \left( | c_i | + | \alpha_k | \right) - + \Psi \left( | \alpha_k | \right) - - \Psi \left( \alpha_{ka} \right) - \right)\,, -\] -where -\[ -X_{ika} = -\frac{ \Psi \left( c_{ia} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + - \Psi \left( c_{ia'} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } - { \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) }\,. -\] - \end{document} + diff --git a/miniapps/Makefile.in b/miniapps/Makefile.in index 18584a42..4a8caab4 100644 --- a/miniapps/Makefile.in +++ b/miniapps/Makefile.in @@ -41,8 +41,6 @@ LDFLAGS = @LDFLAGS@ DEFS = @DEFS@ LIBS = -leasel @LIBGSL@ @LIBS@ @PTHREAD_LIBS@ -lm -PROGS = esl-mixdchlet - ## list of the miniapp commands to compile. # SUBCMDOBJS = \ @@ -99,9 +97,6 @@ check: ${PROGS} easel easel: % : %.c ../libeasel.a ${SUBCMDOBJS} ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${SUBCMDOBJS} ${LIBS} -${PROGS}: % : %.c ../libeasel.a - ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${LIBS} - ${SUBCMDOBJS}: %.o : %.c ../libeasel.a ${QUIET_CC}${CC} -I. -I.. -I${srcdir} -I${srcdir}/.. ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${SIMD_CFLAGS} ${DEFS} -c $< diff --git a/miniapps/esl-mixdchlet.c b/miniapps/esl-mixdchlet.c index df72f389..fa38f8fb 100644 --- a/miniapps/esl-mixdchlet.c +++ b/miniapps/esl-mixdchlet.c @@ -111,7 +111,6 @@ static ESL_OPTIONS fit_options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, - { "-d", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "fit a DNA strand-symmetric mixture to A C G T counts (Q must be even)", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; @@ -120,15 +119,13 @@ cmd_fit(const char *topcmd, const ESL_SUBCMD *sub, int argc, char **argv) { ESL_GETOPTS *go = esl_subcmd_CreateDefaultApp(topcmd, sub, fit_options, argc, argv, /*custom opthelp?:*/NULL); ESL_RANDOMNESS *rng = esl_randomness_Create( esl_opt_GetInteger(go, "-s")); - int isDna = esl_opt_GetBoolean(go, "-d"); // a complement-symmetric mixture Dirichlet for DNA base counts? int Q = strtol(esl_opt_GetArg(go, 1), NULL, 10); // number of mixture components int K = strtol(esl_opt_GetArg(go, 2), NULL, 10); // size of probability/parameter vectors - length of count vectors char *ctfile = esl_opt_GetArg(go, 3); // count file to input char *outfile = esl_opt_GetArg(go, 4); // mixture Dirichlet file output ESL_FILEPARSER *efp = NULL; // open fileparser for reading - if (isDna && Q % 2) esl_fatal("option -d requires number of mixture components Q to be even"); FILE *ofp = NULL; // open output file for writing - ESL_MIXDCHLET *dchl = esl_mixdchlet_CreateForFitting(Q,K,isDna); // mixture Dirichlet being estimated + ESL_MIXDCHLET *dchl = esl_mixdchlet_Create(Q,K); // mixture Dirichlet being estimated int Nalloc = 1024; // initial allocation for ct[] double **ct = esl_mat_DCreate(Nalloc, K); // count vectors, [0..N-1][0..K-1] int N = 0; // number of count vectors so far From 27b2f949e2eed7e55a8391137468f5936e448c82 Mon Sep 17 00:00:00 2001 From: "Martin C. Frith" Date: Thu, 23 Oct 2025 09:22:21 +0000 Subject: [PATCH 3/3] Adds esl-mixdchlet DNA strand symmetry option --- esl_mixdchlet.c | 121 ++++++++++++++++++++++++++++++++++++++- esl_mixdchlet.h | 2 + esl_mixdchlet.tex | 54 ++++++++++++++++- miniapps/Makefile.in | 5 ++ miniapps/esl-mixdchlet.c | 4 +- 5 files changed, 182 insertions(+), 4 deletions(-) diff --git a/esl_mixdchlet.c b/esl_mixdchlet.c index d7fb1117..37b16a88 100644 --- a/esl_mixdchlet.c +++ b/esl_mixdchlet.c @@ -46,6 +46,12 @@ */ ESL_MIXDCHLET * esl_mixdchlet_Create(int Q, int K) +{ + return esl_mixdchlet_CreateForFitting(Q, K, 0); +} + +ESL_MIXDCHLET * +esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric) { ESL_MIXDCHLET *dchl = NULL; int status; @@ -64,6 +70,7 @@ esl_mixdchlet_Create(int Q, int K) dchl->Q = Q; dchl->K = K; + dchl->isDnaStrandSymmetric = isDnaStrandSymmetric; return dchl; ERROR: @@ -218,6 +225,25 @@ mixdchlet_pack_paramvector(ESL_MIXDCHLET *dchl, double *p) int j = 0; /* counter in packed parameter vector

*/ int k,a; /* indices over components, residues */ + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + p[j++] = log(dchl->q[k] * 2); // Prob(component) * 2 = Prob(pair) + + if (k == dchl->Q - 1) // unpaired self-symmetric component + p[j++] = log(dchl->q[k]); + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + p[j++] = log(dchl->alpha[k][a]); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + p[j++] = log(dchl->alpha[k][a]); + + return; + } + /* mixture coefficients */ for (k = 0; k < dchl->Q; k++) p[j++] = log(dchl->q[k]); @@ -240,6 +266,27 @@ mixdchlet_unpack_paramvector(double *p, ESL_MIXDCHLET *dchl) int j = 0; /* counter in packed parameter vector

*/ int k,a; /* indices over components, residues */ + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + dchl->q[k] = dchl->q[k+1] = exp(p[j++]) / 2; + + if (k == dchl->Q - 1) // unpaired self-symmetric component + dchl->q[k] = exp(p[j++]); + + esl_vec_DNorm(dchl->q, dchl->Q); + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = exp(p[j++]); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = exp(p[j++]); + + return; + } + /* mixture coefficients */ for (k = 0; k < dchl->Q; k++) dchl->q[k] = exp(p[j++]); @@ -278,6 +325,7 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) double sum_alpha; // |alpha_k| double sum_c; // |c_i| double psi1; // \Psi(c_ia + alpha_ka) + double psi1b; // \Psi(c_ia' + alpha_ka) double psi2; // \Psi( |c_i| + |alpha_k| ) double psi3; // \Psi( |alpha_k| ) double psi4; // \Psi( alpha_ka ) @@ -289,9 +337,61 @@ mixdchlet_gradient(double *p, int np, void *dptr, double *dp) { mixdchlet_postq(dchl, data->c[i]); // d->postq[q] is now P(q | c_i, theta) sum_c = esl_vec_DSum(data->c[i], dchl->K); // |c_i| + j = 0; + if (dchl->isDnaStrandSymmetric) + { + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + dp[j++] -= dchl->postq[k] + dchl->postq[k+1] - 2 * dchl->q[k]; + + if (k == dchl->Q - 1) // unpaired self-symmetric component + dp[j++] -= dchl->postq[k] - dchl->q[k]; + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + { + double strand_weights[2] = {0, 0}; + for (a = 0; a < dchl->K; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + strand_weights[0] += lgamma(dchl->alpha[k][a] + data->c[i][a]); + strand_weights[1] += lgamma(dchl->alpha[k][a] + data->c[i][b]); + } + esl_vec_DLogNorm(strand_weights, 2); + + sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K); + esl_stats_Psi( sum_alpha + sum_c, &psi2); + esl_stats_Psi( sum_alpha, &psi3); + for (a = 0; a < dchl->K; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1); + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b); + esl_stats_Psi( dchl->alpha[k][a], &psi4); + dp[j++] -= dchl->alpha[k][a] * (dchl->postq[k] + dchl->postq[k+1]) + * (strand_weights[0] * psi1 + strand_weights[1] * psi1b - psi2 + psi3 - psi4); + } + } + + if (k == dchl->Q - 1) // unpaired self-symmetric component + { + sum_alpha = esl_vec_DSum(dchl->alpha[k], dchl->K); + esl_stats_Psi( sum_alpha + sum_c, &psi2); + esl_stats_Psi( sum_alpha, &psi3); + for (a = 0; a < dchl->K / 2; a++) + { + int b = dchl->K - 1 - a; // complement of DNA base a + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][a], &psi1); + esl_stats_Psi( dchl->alpha[k][a] + data->c[i][b], &psi1b); + esl_stats_Psi( dchl->alpha[k][a], &psi4); + dp[j++] -= dchl->alpha[k][a] * dchl->postq[k] + * (psi1 + psi1b - 2 * psi2 + 2 * psi3 - 2 * psi4); + } + } + + continue; + } + /* mixture coefficient gradient */ - j = 0; for (k = 0; k < dchl->Q; k++) dp[j++] -= dchl->postq[k] - dchl->q[k]; @@ -351,6 +451,8 @@ esl_mixdchlet_Fit(double **c, int N, ESL_MIXDCHLET *dchl, double *opt_nll) double fx; int status; + if (dchl->isDnaStrandSymmetric) nparam = (nparam + 1) / 2; + cfg = esl_min_cfg_Create(nparam); if (! cfg) { status = eslEMEM; goto ERROR; } cfg->cg_rtol = 3e-5; @@ -419,6 +521,23 @@ esl_mixdchlet_Sample(ESL_RANDOMNESS *rng, ESL_MIXDCHLET *dchl) { int k,a; + if (dchl->isDnaStrandSymmetric) + { + esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q); + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + dchl->q[k] = dchl->q[k+1] = (dchl->q[k] + dchl->q[k+1]) / 2; + + for (k = 0; k < dchl->Q - 1; k += 2) // paired components + for (a = 0; a < dchl->K; a++) + dchl->alpha[k][a] = dchl->alpha[k+1][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng); + + if (k == dchl->Q - 1) // unpaired self-symmetric component + for (a = 0; a < dchl->K / 2; a++) + dchl->alpha[k][a] = dchl->alpha[k][dchl->K - 1 - a] = 2.0 * esl_rnd_UniformPositive(rng); + + return eslOK; + } + esl_dirichlet_DSampleUniform(rng, dchl->Q, dchl->q); for (k = 0; k < dchl->Q; k++) for (a = 0; a < dchl->K; a++) diff --git a/esl_mixdchlet.h b/esl_mixdchlet.h index 46400767..bd7a2bea 100644 --- a/esl_mixdchlet.h +++ b/esl_mixdchlet.h @@ -20,6 +20,7 @@ typedef struct { double **alpha; /* Dirichlet params alpha[0..Q-1][0..K-1] */ int Q; /* number of mixtures, e.g. 9 for Sjolander */ int K; /* alphabet size, e.g. 20 */ + int isDnaStrandSymmetric; /* is it DNA-strand-symmetric for A C G T? */ double *postq; /* temp space 0..Q-1: for posterior P(k|c) for example */ /*::cexcerpt::dirichlet_mixdchlet::end::*/ @@ -27,6 +28,7 @@ typedef struct { extern ESL_MIXDCHLET *esl_mixdchlet_Create(int Q, int K); +extern ESL_MIXDCHLET *esl_mixdchlet_CreateForFitting(int Q, int K, int isDnaStrandSymmetric); extern void esl_mixdchlet_Destroy(ESL_MIXDCHLET *dchl); extern double esl_mixdchlet_logp_c (ESL_MIXDCHLET *dchl, double *c); diff --git a/esl_mixdchlet.tex b/esl_mixdchlet.tex index 8c5ef284..38c0910c 100644 --- a/esl_mixdchlet.tex +++ b/esl_mixdchlet.tex @@ -1,6 +1,7 @@ \documentclass[11pt]{article} \setcounter{secnumdepth}{0} +\usepackage{amsmath} \usepackage{relsize} \newcommand{\mono}[1]{{\smaller\texttt{#1}}} % literal (to be typed): code, program names @@ -38,7 +39,7 @@ \section{Fitting a mixture Dirichlet to counts} \[ P(c_i \mid \alpha_k) = \frac{ |c_i|! } { \prod_a c_{ia}! } - \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ia} \right) } + \frac{ \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) } { \Gamma ( |c_i + \alpha_k| ) } \frac{ \Gamma ( |\alpha_k| ) } { \prod_a \Gamma \left( \alpha_{ka} \right) } @@ -91,6 +92,55 @@ \section{Fitting a mixture Dirichlet to counts} maximizer. The implementation provides the negative log likelihood and the negative gradient to the CG routine. +\subsection{Fitting a mixture Dirichlet with DNA strand symmetry} -\end{document} +Here, each count vector has $K=4$ counts of DNA bases. For a +strand-symmetric mixture Dirichlet, each component must be either +self-symmetric: $\alpha_{ka} = \alpha_{ka'}$ (where $a'$ is the +complement of base $a$), or paired with a symmetric component: +$\alpha_{ka} = \alpha_{k'a'}$ and $q_k = q_{k'}$ (where $k$ and $k'$ +indicate paired components). Define $R$ to be the number of pairs +plus the number of unpaired components. It's convenient to make these +definitions: +\[ +r_k = \begin{cases} + 2 q_k & \text{for a paired component (i.e., the prior probability of the pair),} \\ + q_k & \text{for an unpaired component} +\end{cases} +\] +\[ +\hat{P}(k \mid \theta, c_i) = \begin{cases} + P(k \mid \theta, c_i) + P(k' \mid \theta, c_i) & \text{for a paired component,}\\ + P(k \mid \theta, c_i) & \text{for an unpaired component} +\end{cases} +\] +Here, the number of $\lambda$ parameters is $R$, and the number of +$\beta$ parameters is: $K \times \text{(number of components) / 2}$. +We can define $\beta_{ka}$ as above, and $\lambda_k$ like this: +\begin{eqnarray*} + r_k & = & \frac{ e^{\lambda_k} } { \sum_{m=1}^R e^{\lambda_m} } \,. +\end{eqnarray*} +Then, +\begin{eqnarray*} +\frac{\partial L}{\partial \lambda_k} &=& \sum_i \bigl( \hat{P}(k \mid \theta, c_i) - r_k \bigr) \\ +\frac{\partial L}{\partial \beta_{ka}} &=& + \begin{cases} +\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i) + (X_{ika} + Z_{ika}) & \text{for a paired component,} \\ +\sum_i \alpha_{ka} \hat{P}(k \mid \theta, c_i) + (Y_{ika} + 2 Z_{ika}) & \text{for an unpaired component} + \end{cases} +\end{eqnarray*} +where +\begin{eqnarray*} +X_{ika} & = & +\frac{ \Psi \left( c_{ia} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + + \Psi \left( c_{ia'} + \alpha_{ka} \right) \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } + { \prod_a \Gamma \left( c_{ia} + \alpha_{ka} \right) + \prod_a \Gamma \left( c_{ia'} + \alpha_{ka} \right) } \,,\\ +Y_{ika} & = & \Psi \left( c_{ia} + \alpha_{ka} \right) + \Psi \left( c_{ia'} + \alpha_{ka} \right) \,,\\ +Z_{ika} & = & - \Psi \left( | c_i | + | \alpha_k | \right) + + \Psi \left( | \alpha_k | \right) + - \Psi \left( \alpha_{ka} \right) \,. +\end{eqnarray*} +\end{document} diff --git a/miniapps/Makefile.in b/miniapps/Makefile.in index 4a8caab4..18584a42 100644 --- a/miniapps/Makefile.in +++ b/miniapps/Makefile.in @@ -41,6 +41,8 @@ LDFLAGS = @LDFLAGS@ DEFS = @DEFS@ LIBS = -leasel @LIBGSL@ @LIBS@ @PTHREAD_LIBS@ -lm +PROGS = esl-mixdchlet + ## list of the miniapp commands to compile. # SUBCMDOBJS = \ @@ -97,6 +99,9 @@ check: ${PROGS} easel easel: % : %.c ../libeasel.a ${SUBCMDOBJS} ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${SUBCMDOBJS} ${LIBS} +${PROGS}: % : %.c ../libeasel.a + ${QUIET_GEN}${CC} ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${DEFS} ${LDFLAGS} -L.. -I. -I.. -I${srcdir} -I${srcdir}/.. -o $@ $< ${LIBS} + ${SUBCMDOBJS}: %.o : %.c ../libeasel.a ${QUIET_CC}${CC} -I. -I.. -I${srcdir} -I${srcdir}/.. ${CPPFLAGS} ${CFLAGS} ${PTHREAD_CFLAGS} ${SIMD_CFLAGS} ${DEFS} -c $< diff --git a/miniapps/esl-mixdchlet.c b/miniapps/esl-mixdchlet.c index fa38f8fb..23150c4a 100644 --- a/miniapps/esl-mixdchlet.c +++ b/miniapps/esl-mixdchlet.c @@ -111,6 +111,7 @@ static ESL_OPTIONS fit_options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, + { "-d", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "fit a DNA strand-symmetric mixture to A C G T counts", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; @@ -119,13 +120,14 @@ cmd_fit(const char *topcmd, const ESL_SUBCMD *sub, int argc, char **argv) { ESL_GETOPTS *go = esl_subcmd_CreateDefaultApp(topcmd, sub, fit_options, argc, argv, /*custom opthelp?:*/NULL); ESL_RANDOMNESS *rng = esl_randomness_Create( esl_opt_GetInteger(go, "-s")); + int isDna = esl_opt_GetBoolean(go, "-d"); // a complement-symmetric mixture Dirichlet for DNA base counts? int Q = strtol(esl_opt_GetArg(go, 1), NULL, 10); // number of mixture components int K = strtol(esl_opt_GetArg(go, 2), NULL, 10); // size of probability/parameter vectors - length of count vectors char *ctfile = esl_opt_GetArg(go, 3); // count file to input char *outfile = esl_opt_GetArg(go, 4); // mixture Dirichlet file output ESL_FILEPARSER *efp = NULL; // open fileparser for reading FILE *ofp = NULL; // open output file for writing - ESL_MIXDCHLET *dchl = esl_mixdchlet_Create(Q,K); // mixture Dirichlet being estimated + ESL_MIXDCHLET *dchl = esl_mixdchlet_CreateForFitting(Q,K,isDna); // mixture Dirichlet being estimated int Nalloc = 1024; // initial allocation for ct[] double **ct = esl_mat_DCreate(Nalloc, K); // count vectors, [0..N-1][0..K-1] int N = 0; // number of count vectors so far