1+ import pytest
2+ import pandas as pd
3+ import numpy as np
4+ from mutclust .pca_analysis import calculate_eigen_genes
5+
def test_empty_cluster_error():
    """An empty gene list among the clusters must be rejected with ValueError."""
    # A single gene row with two sample columns is enough to reach the check.
    expression_data = pd.DataFrame(
        [[1.0, 2.0]],
        index=['Gene1'],
        columns=['Sample1', 'Sample2'],
    )

    populated_cluster = ['Gene1']
    empty_cluster = []  # the offending entry
    gene_clusters = [populated_cluster, empty_cluster]

    expected_message = "Cannot perform PCA on empty clusters"
    with pytest.raises(ValueError, match=expected_message):
        calculate_eigen_genes(expression_data, gene_clusters)
20+
def test_single_gene_clusters():
    """A cluster holding one gene should yield that gene's own expression values."""
    # Rows are genes, columns are samples.
    expression_data = pd.DataFrame(
        [[1.0, 3.0], [2.0, 4.0]],
        index=['Gene1', 'Gene2'],
        columns=['Sample1', 'Sample2'],
    )

    singleton_genes = ['Gene1', 'Gene2']
    gene_clusters = [[gene] for gene in singleton_genes]

    eigen_genes = calculate_eigen_genes(expression_data, gene_clusters)

    # Each output column is compared against the row of the lone gene in
    # the corresponding cluster.
    for column, gene in zip(['Cluster_0', 'Cluster_1'], singleton_genes):
        observed = eigen_genes[column].values
        expected = expression_data.loc[gene].values
        assert np.allclose(observed, expected, rtol=1e-5)
40+
def test_large_dataset_parallel():
    """Check the eigen-gene output for a 100-gene, 50-sample dataset.

    Ten clusters of ten consecutive genes are passed to
    ``calculate_eigen_genes``. The result must be a DataFrame with one row
    per sample and one ``Cluster_<i>`` column per cluster, none of which is
    all zeros. Only the output is inspected; whether the work actually ran
    in parallel is not verified here.
    """
    n_genes = 100
    n_samples = 50
    n_clusters = 10
    genes_per_cluster = n_genes // n_clusters

    # A private generator seeded with 42 yields the same stream as seeding
    # the global RNG, but without mutating process-wide state that other
    # tests might depend on.
    rng = np.random.RandomState(42)
    expression_data = pd.DataFrame(
        rng.randn(n_genes, n_samples),
        index=[f'Gene{i}' for i in range(n_genes)],
        columns=[f'Sample{i}' for i in range(n_samples)],
    )

    # Consecutive, non-overlapping slices of the gene list.
    gene_clusters = [
        [f'Gene{i}' for i in range(start, start + genes_per_cluster)]
        for start in range(0, n_genes, genes_per_cluster)
    ]

    eigen_genes = calculate_eigen_genes(expression_data, gene_clusters)

    expected_columns = [f'Cluster_{i}' for i in range(n_clusters)]

    assert isinstance(eigen_genes, pd.DataFrame)
    assert eigen_genes.shape == (n_samples, n_clusters)
    # Compare as plain lists so any mismatch surfaces as an ordinary
    # assertion failure rather than depending on element-wise Index
    # comparison semantics.
    assert list(eigen_genes.columns) == expected_columns

    for column in expected_columns:
        assert len(eigen_genes[column]) == n_samples
        # A constant-zero column would indicate a degenerate result.
        assert not np.allclose(eigen_genes[column], 0)
0 commit comments