From 7b59361fa76cbb9f93e665edf610f904ec321e1b Mon Sep 17 00:00:00 2001 From: Nick Harding Date: Tue, 23 Jul 2019 17:35:57 +0100 Subject: [PATCH 1/3] Add 3 functions to simulate genotypes/allele counts for quick prototyping --- allel/simulate/__init__.py | 2 + allel/simulate/simulate.py | 102 +++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 allel/simulate/__init__.py create mode 100644 allel/simulate/simulate.py diff --git a/allel/simulate/__init__.py b/allel/simulate/__init__.py new file mode 100644 index 00000000..76651761 --- /dev/null +++ b/allel/simulate/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +# flake8: noqa diff --git a/allel/simulate/simulate.py b/allel/simulate/simulate.py new file mode 100644 index 00000000..c8664aed --- /dev/null +++ b/allel/simulate/simulate.py @@ -0,0 +1,102 @@ +from allel.model.dask import GenotypeDaskArray, GenotypeAlleleCountsDaskArray, AlleleCountsDaskArray + +import numpy as np +import dask.array as da + + +def simulate_genotypes(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): + """generate genotypes from a random distribution + + Parameters + ---------- + n_variants : int + number of variants to generate + n_samples : int + number of samples to generate + p : tuple, float + probability of each allele, must sum to 1. This is used to implicitly specify the number of alleles + ploidy : int + ploidy of individuals + + Returns + ------- + GenotypeDaskArray: int8, shape (nvariants, nsamples, ploidy) + + Notes + ----- + For speed and efficiency all variants are drawn from the same distribution. + For a more "realistic" simulate this simple function may want to be extended. 
+ + """ + a = np.arange(0, len(p), dtype="int8") + + g = da.random.choice( + a, size=(n_variants, n_samples, ploidy), p=p) + + return GenotypeDaskArray(g) + + +def simulate_allele_counts_array(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): + """generate an allele counts array from a random distribution + + Parameters + ---------- + n_variants : int + number of variants to generate + n_samples : int + number of samples to generate + p : tuple, float + probability of each allele, must sum to 1. This is used to implicitly specify the number of alleles + ploidy : int + ploidy of individuals + + Returns + ------- + GenotypeAlleleCountsDaskArray: int64, shape (nvariants, nsamples, n_alleles) + + Notes + ----- + For speed and efficiency all variants are drawn from the same distribution. + For a more "realistic" simulate this simple function may want to be extended. + + """ + + aca = da.random.multinomial( + ploidy, + p, + size=(n_variants, n_samples)) + + return GenotypeAlleleCountsDaskArray(aca) + + +def simulate_allele_counts(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): + """generate allele counts from a random distribution + + Parameters + ---------- + n_variants : int + number of variants to generate + n_samples : int + number of samples to generate + p : tuple, float + probability of each allele, must sum to 1. This is used to implicitly specify the number of alleles + ploidy : int + ploidy of individuals + + Returns + ------- + AlleleCountsDaskArray: int, shape (nvariants, n_alleles) + + Notes + ----- + For speed and efficiency all variants are drawn from the same distribution. + For a more "realistic" simulate this simple function may want to be extended. 
+ + """ + + ac = da.random.multinomial( + ploidy * n_samples, + p, + size=(n_variants,)) + + return AlleleCountsDaskArray(ac) From 696377367f6d084b5511659a5e335c48ea23ae81 Mon Sep 17 00:00:00 2001 From: Nick Harding Date: Tue, 23 Jul 2019 17:38:59 +0100 Subject: [PATCH 2/3] fix line lengths for PEP8 --- allel/simulate/simulate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/allel/simulate/simulate.py b/allel/simulate/simulate.py index c8664aed..62022f8f 100644 --- a/allel/simulate/simulate.py +++ b/allel/simulate/simulate.py @@ -14,7 +14,8 @@ def simulate_genotypes(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): n_samples : int number of samples to generate p : tuple, float - probability of each allele, must sum to 1. This is used to implicitly specify the number of alleles + probability of each allele, must sum to 1. + This is used to implicitly specify the number of alleles ploidy : int ploidy of individuals @@ -46,7 +47,8 @@ def simulate_allele_counts_array(n_variants, n_samples, p=(0.95, 0.05), ploidy=2 n_samples : int number of samples to generate p : tuple, float - probability of each allele, must sum to 1. This is used to implicitly specify the number of alleles + probability of each allele, must sum to 1. + This is used to implicitly specify the number of alleles ploidy : int ploidy of individuals @@ -79,7 +81,8 @@ def simulate_allele_counts(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): n_samples : int number of samples to generate p : tuple, float - probability of each allele, must sum to 1. This is used to implicitly specify the number of alleles + probability of each allele, must sum to 1. 
+ This is used to implicitly specify the number of alleles ploidy : int ploidy of individuals From 4689493b004de974a9fbefa3801ecb29bc461221 Mon Sep 17 00:00:00 2001 From: Nick Harding Date: Tue, 5 Nov 2019 15:30:36 +0000 Subject: [PATCH 3/3] Improve param strings and rename function --- allel/simulate/simulate.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/allel/simulate/simulate.py b/allel/simulate/simulate.py index 62022f8f..2283b645 100644 --- a/allel/simulate/simulate.py +++ b/allel/simulate/simulate.py @@ -5,7 +5,7 @@ def simulate_genotypes(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): - """generate genotypes from a random distribution + """Generate genotypes from a random distribution. Parameters ---------- @@ -26,7 +26,10 @@ def simulate_genotypes(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): Notes ----- For speed and efficiency all variants are drawn from the same distribution. - For a more "realistic" simulate this simple function may want to be extended. + For a more "realistic" simulation this simple function may want to be extended. + + `np.random.dirichlet((n1, n2, n3, n4))` can be used to generate an appropriate vector for p, + where n are pseudocounts """ a = np.arange(0, len(p), dtype="int8") @@ -37,8 +40,8 @@ def simulate_genotypes(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): return GenotypeDaskArray(g) -def simulate_allele_counts_array(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): - """generate an allele counts array from a random distribution +def simulate_genotype_allele_counts(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): + """Generate a genotype allele counts array from a random multinomial distribution. Parameters ---------- @@ -59,7 +62,10 @@ def simulate_allele_counts_array(n_variants, n_samples, p=(0.95, 0.05), ploidy=2 Notes ----- For speed and efficiency all variants are drawn from the same distribution. 
- For a more "realistic" simulate this simple function may want to be extended. + For a more "realistic" simulation this simple function may want to be extended. + + `np.random.dirichlet((n1, n2, n3, n4))` can be used to generate an appropriate vector for p, + where n are pseudocounts """ @@ -72,7 +78,7 @@ def simulate_allele_counts_array(n_variants, n_samples, p=(0.95, 0.05), ploidy=2 def simulate_allele_counts(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): - """generate allele counts from a random distribution + """Generate an allele counts array from a random multinomial distribution. Parameters ---------- @@ -93,7 +99,10 @@ def simulate_allele_counts(n_variants, n_samples, p=(0.95, 0.05), ploidy=2): Notes ----- For speed and efficiency all variants are drawn from the same distribution. - For a more "realistic" simulate this simple function may want to be extended. + For a more "realistic" simulation this simple function may want to be extended. + + `np.random.dirichlet((n1, n2, n3, n4))` can be used to generate an appropriate vector for p, + where n are pseudocounts """