-
Notifications
You must be signed in to change notification settings - Fork 4
/
variant.hpp
255 lines (236 loc) · 7.35 KB
/
variant.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/**
* MALVA - genotyping by Mapping-free ALternate-allele detection of known VAriants
* Copyright (C) 2019 Giulia Bernardini, Luca Denti, Marco Previtali
*
* This file is part of MALVA.
*
* MALVA is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MALVA is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with MALVA; see the file LICENSE. If not, see
* <https://www.gnu.org/licenses/>.
**/
#ifndef _VARIANT_HPP_
#define _VARIANT_HPP_
#include <numeric>
#include <string>
#include <vector>
#include <string_view>
using namespace std;
typedef struct
{
string first;
double second;
} GT;
typedef struct
{
int first;
int second;
} Geno;
struct Variant
{
string seq_name;
int ref_pos; // Variant position 0-based
string idx; // ID
string ref_sub; // Reference base{s}
vector<string> alts; // List of alternatives
float quality; // Quality field
string filter; // Filter field
string info; // Info field
vector<Geno> genotypes; // full list of genotypes
vector<bool> phasing; // true if genotype i-th is phased, false otherwise
int ref_size; // Len of reference base{s}
int min_size; // Length of the shortest string (ref and alts)
int max_size; // Length of the longest string (ref and alts)
bool has_alts = true; // false if no alternatives, i.e. only <CN>
bool is_present = true; // false if no sample has this variant
vector<float> frequencies; // Allele frequency in the considered population
vector<uint> coverages; // Allele coverages (computed from input sample)
vector<GT> computed_gts; // Computed genotypes
Variant() {}
Variant(bcf_hdr_t *vcf_header, bcf1_t *vcf_record, const string &freq_key, const bool uniform)
{
seq_name = bcf_hdr_id2name(vcf_header, vcf_record->rid);
ref_pos = vcf_record->pos;
idx = vcf_record->d.id;
ref_sub = vcf_record->d.allele[0];
transform(ref_sub.begin(), ref_sub.end(), ref_sub.begin(), ::toupper);
/**
* There is vcf_record->rlen for the ref_size but sometimes it
* returns a wrong value. The only example I found is when all
* alternatives are <CN> but maybe there could be more cases...
**/
ref_size = (int)ref_sub.size();
for (int i = 1; i < vcf_record->n_allele; ++i)
{
char *curr_alt = vcf_record->d.allele[i];
if (curr_alt[0] != '<')
{
string alt_tmp(curr_alt);
transform(alt_tmp.begin(), alt_tmp.end(), alt_tmp.begin(), ::toupper);
alts.emplace_back(alt_tmp);
}
}
coverages.resize(alts.size() + 1, 0); // +1 for the reference allele
quality = vcf_record->qual;
filter = "PASS"; // TODO: get filter string from VCF
info = "."; // TODO: get info string from VCF
// Set sizes and has_alts flag
set_sizes();
if (has_alts)
{
// Populate frequencies vector
extract_frequencies(vcf_header, vcf_record, freq_key, uniform);
if (is_present)
// Populate genotypes and phasing
extract_genotypes(vcf_header, vcf_record);
}
}
/**
* Set the has_alts flag and the min/max size of the variant
**/
void set_sizes()
{
if (alts.size() == 0)
has_alts = false;
else
{
min_size = ref_size;
max_size = ref_size;
for (uint i = 0; i < alts.size(); i++)
{
if ((int)alts[i].size() < min_size)
min_size = alts[i].size();
if ((int)alts[i].size() > max_size)
max_size = alts[i].size();
}
}
}
void extract_frequencies(bcf_hdr_t *vcf_header, bcf1_t *vcf_record,
const string &freq_key, const bool uniform)
{
int ndst = 0;
float *altall_freqs = NULL;
bcf_get_info_float(vcf_header, vcf_record, freq_key.c_str(),
&altall_freqs, &ndst);
if (!uniform)
{
frequencies.emplace_back(0); // First element is reserved for reference allele
for (uint i = 0; i < alts.size(); ++i)
{
float *freq = altall_freqs + i;
frequencies.emplace_back(freq[0]);
}
// Here we compute the frequency of the reference allele
frequencies[0] = 1.0 - accumulate(frequencies.begin(), frequencies.end(), 0.0);
if (frequencies[0] < 0) // this to avoid bad approximation
frequencies[0] = 0.0;
}
else
{
float uniform_freq = 1.0 / (alts.size() + 1);
for (uint i = 0; i < alts.size() + 1; ++i)
frequencies.emplace_back(uniform_freq);
}
if (frequencies[0] == 1.0)
is_present = false;
}
void extract_genotypes(bcf_hdr_t *vcf_header, bcf1_t *vcf_record)
{
// number of individuals from header
int n_individuals = bcf_hdr_nsamples(vcf_header);
int32_t *gt_arr = NULL, ngt = 0;
int ngt_ret_value =
bcf_get_genotypes(vcf_header, vcf_record, >_arr, &ngt);
/***
* If the record contains GT fields, ngt == ngt_ret_value == #GT.
* Otherwise, ngt_ret_value is <= 0 that means an error occurred.
***/
if (ngt_ret_value <= 0)
{
// cout << "The record doesn't contain GT information" << endl;
has_alts = false;
return;
}
int ploidy = ngt / n_individuals;
for (int i = 0; i < n_individuals; ++i)
{
int32_t *curr_gt = gt_arr + i * ploidy;
// Here, I'm assuming ploidy = 2. Otherwise, loop til ploidy
int all_1;
int all_2;
bool is_phased = false;
if (curr_gt[1] == bcf_int32_vector_end)
{
all_1 = bcf_gt_allele(curr_gt[0]);
all_2 = bcf_gt_allele(curr_gt[0]);
is_phased = true;
}
else
{
all_1 = bcf_gt_allele(curr_gt[0]);
all_2 = bcf_gt_allele(curr_gt[1]);
if (bcf_gt_is_phased(curr_gt[1]))
// this works, but I'm not 100% sure it's sufficient
is_phased = true;
}
if (all_1 < 0)
all_1 = 0;
if (all_2 < 0)
all_2 = 0;
// FIXME: we store two alleles per sample even when we have an
// haploid VCF. We manage haploid VCF in a special way in
// var_block.hpp
Geno gen = {all_1, all_2};
genotypes.emplace_back(gen);
phasing.emplace_back(is_phased);
}
free(gt_arr);
}
/**
* Return the i-th allele, 0 is the reference
**/
string_view get_allele(const int i) const
{
if (i == 0)
return ref_sub;
else
return alts[i - 1];
}
/**
* Given an allele, return its position in the list (1-based since 0 is the
*reference)
**/
int get_allele_index(const string_view &a) const
{
if (ref_sub == a)
return 0;
int i = 1;
for (const string &all : alts)
{
if (all == a)
return i;
++i;
}
return -1;
}
void set_coverage(const int i, const uint cov)
{
// maybe we can add some control here
coverages[i] = cov;
}
void add_genotype(const GT >)
{
// maybe we can add some control here
computed_gts.emplace_back(gt);
}
};
#endif