# nnperm/plot_simulated_aggregated.py (master branch of randommm/nnperm on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#----------------------------------------------------------------------
# Copyright 2018 Marco Inacio <pythonpackages@marcoinacio.com>
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, version 3 of the License.

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
#GNU General Public License for more details.

#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
#----------------------------------------------------------------------

import numpy as np
import pandas as pd
import itertools
from plotnine import *
from db_structure import Result

# Load the simulation results to plot: complexity 1, every method except
# 'remove', and rows that either used retrained permutations
# (retrain_permutations != 0) or have a distribution index >= 2.
#
# The two alternatives must be combined with `|`. Python's `or` cannot be
# overloaded by peewee: it evaluates the truthiness of the first expression
# object and returns it, silently dropping the `distribution >= 2` clause.
df = pd.DataFrame(list(
    Result.select().where(
        Result.complexity == 1,
        Result.method != 'remove',
        (Result.retrain_permutations != 0) | (Result.distribution >= 2),
    ).dicts()
))

def _method_label(method, retrained):
    """Return the x-axis label for one (method, retrained) combination."""
    label = {'permutation': 'COINP', 'shuffle_once': 'CPI'}.get(method, method)
    if not retrained and label != "remove":
        label = "Approximate " + label
    return label


def _rejection_rates(df, power):
    """Return, per group, the percentage of rows whose pvalue is below *power*.

    Rows are grouped by method, estimator, betat and retrain_permutations; the
    resulting ``pvalue`` column holds the percentage rounded to one decimal.
    """
    keys = ["method", "estimator", 'betat', 'retrain_permutations']
    # The mean of a 0/1 indicator is the fraction of rows below the threshold
    # (missing p-values compare as False and still count in the denominator).
    indicator = df.assign(pvalue=(df['pvalue'] < power).astype(float))
    rates = indicator.groupby(keys, as_index=False)['pvalue'].mean()
    rates['pvalue'] = np.round(rates['pvalue'] * 100, 1)
    return rates


def plotcdfs(df, distribution, power=0.05):
    """Build the aggregated bar chart for one distribution.

    Only rows with the requested ``distribution`` and ``betat > 0`` are used.
    For each database size (1000 and 10000) the rows are grouped by method,
    estimator, betat and retrain_permutations, and the bar height is the
    percentage of rows in the group with ``pvalue < power``.

    The 1000-size layer is drawn as filled bars with a legend; the 10000-size
    layer is drawn on top as fully transparent bars with a dark outline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns distribution, betat, db_size, method,
        estimator, retrain_permutations and pvalue.
    distribution : int
        Zero-based distribution index; the title shows ``distribution + 1``.
    power : float, optional
        Threshold a p-value must fall below to be counted (default 0.05).

    Returns
    -------
    plotnine.ggplot
        The assembled plot.
    """
    selected = df[(df['distribution'] == distribution) & (df['betat'] > 0)]

    ccolor = '#555555'
    plot = ggplot()
    tallest_bars = []
    for db_size in [1000, 10000]:
        dfs = _rejection_rates(selected[selected['db_size'] == db_size], power)
        tallest_bars.append(np.max(dfs['pvalue']))

        # Label combining the (renamed) method with whether permutations
        # were retrained (any non-zero retrain_permutations counts as True).
        dfs['retrain_and_method'] = [
            _method_label(method, bool(retrain))
            for method, retrain
            in zip(dfs['method'], dfs['retrain_permutations'])
        ]

        # Label combining betat with the upper-cased estimator name.
        dfs['betat'] = np.array(dfs['betat'], dtype="str")
        dfs['estimator'] = dfs["estimator"].apply(lambda x: x.upper())
        dfs['betat_and_estimator'] = [
            ' and '.join(pair) for pair in zip(dfs["betat"], dfs["estimator"])
        ]

        if db_size == 1000:
            plot += geom_col(
                dfs,
                aes(x='retrain_and_method', y='pvalue',
                    fill='betat_and_estimator'),
                show_legend=True, position="dodge",
            )
            plot += guides(fill=guide_legend(title="betat and \n estimator \n"))
        else:
            plot += geom_col(
                dfs,
                aes(x='retrain_and_method', y='pvalue',
                    fill='betat_and_estimator'),
                show_legend=False, position="dodge", alpha=0.0,
                color="#110011",
            )
            plot += scale_color_discrete(l=.4)

    plot += theme(
        panel_background=element_rect(fill='white'),
        axis_line_x=element_line(color='black'),
        axis_text_x=element_text(color=ccolor, rotation=90),
        panel_grid=element_blank(),
        panel_border=element_blank(),
    )

    plot += ggtitle("Distribution " + str(distribution+1))
    plot += ylab("Test power")
    plot += xlab("Method and retrain")
    # Size the y axis from the tallest bar of *both* layers so that neither
    # database size can fall outside the limits.
    plot += lims(y=(0, np.max(tallest_bars)+2))

    return plot

# Render the aggregated plot of every distribution index (0-4) and save
# each one as a PDF under plots/, numbered from 1 in the file name.
for distribution in range(5):
    filename = "".join([
        "plots/",
        "aggregated",
        "_distribution" + str(distribution+1),
        ".pdf",
    ])
    figure = plotcdfs(df.copy(), distribution)
    figure.save(filename)