-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcoder.c
More file actions
212 lines (188 loc) · 5.06 KB
/
coder.c
File metadata and controls
212 lines (188 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Copyright (C) 2011-2012 Gad Abraham and National ICT Australia (NICTA).
* All rights reserved.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "common.h"
#include "coder.h"
/* A precomputed PLINK mapping from BED words to allele dosage
*
* We take the BED word, interpret it as an int 0...255.
* For the kth int, we store 4 genotypes (minor allele dosages) {0,1,2,3}
*
* See comments for decode_plink() on the mapping
*
* */
int mapping_init(mapping *m)
{
int a1, a2;
int i, k;
unsigned char geno;
m->size = 256;
CALLOCTEST(m->map, m->size, sizeof(dtype*));
for(i = 0 ; i < m->size ; i++)
{
CALLOCTEST(m->map[i], 4, sizeof(dtype));
/* i is interpreted in binary as 4 genotypes */
k = 0;
/* 1st genotype */
geno = (i & MASK0);
a1 = !(geno & 1);
a2 = !(geno >> 1);
m->map[i][k] = (geno == 1) ? 3 : a1 + a2;
k++;
/* 2nd genotype */
geno = (i & MASK1) >> 2;
a1 = !(geno & 1);
a2 = !(geno >> 1);
m->map[i][k] = (geno == 1) ? 3 : a1 + a2;
k++;
/* 3rd genotype */
geno = (i & MASK2) >> 4;
a1 = !(geno & 1);
a2 = !(geno >> 1);
m->map[i][k] = (geno == 1) ? 3 : a1 + a2;
k++;
/* 4th genotype */
geno = (i & MASK3) >> 6;
a1 = !(geno & 1);
a2 = !(geno >> 1);
m->map[i][k] = (geno == 1) ? 3 : a1 + a2;
}
return SUCCESS;
}
void mapping_free(mapping *m)
{
int i;
if(m)
{
for(i = 0 ; i < m->size ; i++)
FREENULL(m->map[i]);
FREENULL(m->map);
}
}
/*
* Converts an array of short integers {0,1,2,3} (3 is NA) to binary
* {00, 01, 10, 11}, so that each byte contains 4 genotypes.
*
* length of out should be ceiling(length(in) / 4.0), so it should be padded
* with zeros if the number of bytes is not a multiple of 4. This function
* does not check for alignment!
*
* n: length of "in" buffer
*
*/
void encode(unsigned char *out, const unsigned char *in, const int n)
{
int i, j, k = 0;
for(i = 0 ; i < n ; i += PACK_DENSITY)
{
out[k] = 0;
for(j = 0 ; j < PACK_DENSITY ; j++)
if(i + j < n)
out[k] |= in[i + j] << (2 * j);
k++;
}
}
/*
* n: length of "in" buffer
*
* out must be a pointer of length sizeof(char) * n * PACK_DENSITY
*/
void decode(unsigned char *out,
const unsigned char *in, const int n)
{
int i, k;
unsigned char tmp;
for(i = 0 ; i < n ; ++i)
{
tmp = in[i];
k = PACK_DENSITY * i;
out[k++] = (tmp & MASK0);
out[k++] = (tmp & MASK1) >> 2;
out[k++] = (tmp & MASK2) >> 4;
out[k] = (tmp & MASK3) >> 6;
}
}
/*
* plink BED sparsnp
* minor homozyous: 00 => numeric 0 10 => numeric 2
* heterozygous: 10 => numeric 2 01 => numeric 1
* major homozygous: 11 => numeric 3 00 => numeric 0
* missing: 01 => numeric 1 11 => numeric 3
*
*
* http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml says,
* The bytes in plink are read backwards HGFEDCBA, not GHEFCDAB, but we read
* them forwards as a character (a proper byte)
*
* By default, plink usage dosage of the *major* allele, since allele A1 is
* usually the minor allele and the code "1" refers to the second allele A2,
* so that "11" is A2/A2 or major/major.
*
* We always use minor allele dosage, to be consistent with the output from
* plink --recodeA which used minor allele dosage by default.
*
* out: array of genotypes
* in: array of packed genotypes (bytes)
* n: number of bytes in input
*
*/
void decode_plink(unsigned char *out,
const unsigned char *in, const int n)
{
int i, k;
unsigned char tmp, geno;
int a1, a2;
for(i = 0 ; i < n ; ++i)
{
tmp = in[i];
k = PACK_DENSITY * i;
/* geno is interpreted as a char, however a1 and a2 are bits for allele 1 and
* allele 2. The final genotype is the sum of the alleles, except for 01
* which denotes missing.
*/
geno = (tmp & MASK0);
a1 = !(geno & 1);
a2 = !(geno >> 1);
out[k] = (geno == 1) ? 3 : a1 + a2;
k++;
geno = (tmp & MASK1) >> 2;
a1 = !(geno & 1);
a2 = !(geno >> 1);
out[k] = (geno == 1) ? 3 : a1 + a2;
k++;
geno = (tmp & MASK2) >> 4;
a1 = !(geno & 1);
a2 = !(geno >> 1);
out[k] = (geno == 1) ? 3 : a1 + a2;
k++;
geno = (tmp & MASK3) >> 6;
a1 = !(geno & 1);
a2 = !(geno >> 1);
out[k] = (geno == 1) ? 3 : a1 + a2;
}
}
/* n: number of bytes in input
*/
void decode_plink_mapping(mapping *map, unsigned char *out,
const unsigned char *in, const int n)
{
int i, k = 0;
dtype *tmp;
for(i = 0 ; i < n ; i++)
{
tmp = map->map[in[i]];
out[k++] = tmp[0];
out[k++] = tmp[1];
out[k++] = tmp[2];
out[k++] = tmp[3];
}
}