-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PhredNtHash class #108
base: master
Are you sure you want to change the base?
PhredNtHash class #108
Changes from all commits
593bf46
a912dcd
020a5a3
cebddbb
dc8b9b9
e03786e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#ifndef BTLLIB_PHRED_NTHASH_HPP | ||
#define BTLLIB_PHRED_NTHASH_HPP | ||
|
||
#include "btllib/nthash.hpp" | ||
#include "btllib/util.hpp" | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
namespace btllib { | ||
/** | ||
* NtHash with Phred score filtering. | ||
* | ||
* Privately inherits from NtHash and re-exposes a subset of its interface | ||
* through forwarding members. roll() and roll_back() skip k-mers that contain | ||
* a base whose quality is below `phred_min`. | ||
*/ | ||
class PhredNtHash : private NtHash | ||
{ | ||
public: | ||
/** | ||
* Constructor for PhredNtHash (raw sequence buffer, quality as string view). | ||
* @param seq Sequence to hash. | ||
* @param seq_len Length of `seq`. | ||
* @param hash_num Number of hashes to compute. | ||
* @param k Length of k-mer. | ||
* @param phred_min Minimum Phred score that every base of a k-mer must have | ||
* for the k-mer to be hashed. Given as a numeric score; PHRED_OFFSET is added | ||
* internally before it is compared with the characters of `quality_string`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the explanation for |
||
* @param quality_string String of Phred scores for each base in `seq`. | ||
* @param pos Position to start hashing from. | ||
*/ | ||
PhredNtHash(const char* seq, | ||
size_t seq_len, | ||
unsigned hash_num, | ||
unsigned k, | ||
size_t phred_min, | ||
std::string_view quality_string, | ||
size_t pos = 0); | ||
|
||
/** | ||
* Constructor for PhredNtHash (std::string sequence, quality as string view). | ||
* @param seq Sequence to hash. | ||
* @param hash_num Number of hashes to compute. | ||
* @param k Length of k-mer. | ||
* @param phred_min Minimum Phred score that every base of a k-mer must have | ||
* for the k-mer to be hashed. Given as a numeric score; PHRED_OFFSET is added | ||
* internally before it is compared with the characters of `quality_string`. | ||
* @param quality_string String of Phred scores for each base in `seq`. | ||
* @param pos Position to start hashing from. | ||
*/ | ||
PhredNtHash(const std::string& seq, | ||
unsigned hash_num, | ||
unsigned k, | ||
size_t phred_min, | ||
std::string_view quality_string, | ||
size_t pos = 0); | ||
|
||
/** | ||
* Constructor for PhredNtHash (raw sequence buffer, quality as C string). | ||
* @param seq Sequence to hash. | ||
* @param seq_len Length of `seq`. | ||
* @param hash_num Number of hashes to compute. | ||
* @param k Length of k-mer. | ||
* @param phred_min Minimum Phred score that every base of a k-mer must have | ||
* for the k-mer to be hashed. Given as a numeric score; PHRED_OFFSET is added | ||
* internally before it is compared with the characters of `quality_string`. | ||
* @param quality_string String of Phred scores for each base in `seq`. | ||
* @param pos Position to start hashing from. | ||
*/ | ||
PhredNtHash(const char* seq, | ||
size_t seq_len, | ||
unsigned hash_num, | ||
unsigned k, | ||
size_t phred_min, | ||
const char* quality_string, | ||
size_t pos = 0); | ||
|
||
/** | ||
* Constructor for PhredNtHash (std::string sequence, quality as C string). | ||
* @param seq Sequence to hash. | ||
* @param hash_num Number of hashes to compute. | ||
* @param k Length of k-mer. | ||
* @param phred_min Minimum Phred score that every base of a k-mer must have | ||
* for the k-mer to be hashed. Given as a numeric score; PHRED_OFFSET is added | ||
* internally before it is compared with the characters of `quality_string`. | ||
* @param quality_string String of Phred scores for each base in `seq`. | ||
* @param pos Position to start hashing from. | ||
*/ | ||
PhredNtHash(const std::string& seq, | ||
unsigned hash_num, | ||
unsigned k, | ||
size_t phred_min, | ||
const char* quality_string, | ||
size_t pos = 0); | ||
|
||
/** | ||
* Roll the hash forward by one base. If the next k-mer contains a base with | ||
* a Phred score below `phred_min`, the hash will be rolled forward until a | ||
* k-mer whose bases all have a Phred score of at least `phred_min` is found. | ||
* @return True if successful, false if the end of the sequence is reached or | ||
* no k-mer whose bases all have a Phred score of at least `phred_min` is found. | ||
*/ | ||
bool roll(); | ||
/** | ||
* Roll the hash backward by one base. If the previous k-mer contains a base | ||
* with a Phred score below `phred_min`, the hash will be rolled backward | ||
* until a k-mer whose bases all have a Phred score of at least `phred_min` | ||
* is found. | ||
* @return True if successful, false if the start of the sequence is reached | ||
* or no k-mer whose bases all have a Phred score of at least `phred_min` is | ||
* found. | ||
*/ | ||
bool roll_back(); | ||
/** | ||
* Forwards to NtHash::hashes() of the privately inherited base. | ||
* @return Pointer returned by NtHash::hashes(). | ||
*/ | ||
const uint64_t* hashes() const { return NtHash::hashes(); } | ||
|
||
/** | ||
* Get the position of last hashed k-mer or the k-mer to be hashed if roll() | ||
* has never been called on this NtHash object. | ||
* | ||
* This and the accessors that follow simply forward to the NtHash member of | ||
* the same name, which private inheritance would otherwise hide. | ||
*/ | ||
size_t get_pos() const { return NtHash::get_pos(); } | ||
bool forward() const { return NtHash::forward(); } | ||
unsigned get_hash_num() const { return NtHash::get_hash_num(); } | ||
unsigned get_k() const { return NtHash::get_k(); } | ||
size_t get_seq_len() const { return NtHash::get_seq_len(); } | ||
uint64_t get_forward_hash() const { return NtHash::get_forward_hash(); } | ||
uint64_t get_reverse_hash() const { return NtHash::get_reverse_hash(); } | ||
|
||
private: | ||
/* | ||
* qual_seq:  non-owning pointer to the quality string handed to the | ||
*            constructor; indexed by sequence position in roll()/roll_back(). | ||
* phred_min: quality threshold with PHRED_OFFSET already added, so it can be | ||
*            compared directly with the characters of qual_seq. | ||
* rmq:       range-minimum-query structure built over the quality string; | ||
*            used to find the index of the lowest-quality base of a k-mer. | ||
*/ | ||
const char* qual_seq; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's prefer to use a |
||
unsigned char phred_min; | ||
RangeMinimumQuery<std::string_view> rmq; | ||
}; | ||
|
||
} // namespace btllib | ||
|
||
#endif // BTLLIB_PHRED_NTHASH_HPP |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#include "btllib/phred_nthash.hpp" | ||
|
||
namespace btllib { | ||
// Builds a Phred-filtered hasher over a raw sequence buffer, with the quality
// string supplied as a string view.
//
// The numeric threshold is shifted by PHRED_OFFSET once, here, so that roll()
// and roll_back() can compare it directly against raw quality characters.
PhredNtHash::PhredNtHash(const char* seq,
                         size_t seq_len,
                         unsigned hash_num,
                         unsigned k,
                         size_t phred_min,
                         std::string_view quality_string,
                         size_t pos)
  : NtHash(seq, seq_len, hash_num, k, pos)
  , qual_seq(quality_string.data())
  , phred_min(static_cast<unsigned char>(
      phred_min + static_cast<size_t>(PHRED_OFFSET)))
  , rmq(quality_string, seq_len)
{}
|
||
// Builds a Phred-filtered hasher over a std::string sequence, with the quality
// string supplied as a string view. The sequence length handed to the
// range-minimum-query structure is taken from `seq`.
//
// The numeric threshold is shifted by PHRED_OFFSET once, here, so that roll()
// and roll_back() can compare it directly against raw quality characters.
PhredNtHash::PhredNtHash(const std::string& seq,
                         unsigned hash_num,
                         unsigned k,
                         size_t phred_min,
                         std::string_view quality_string,
                         size_t pos)
  : NtHash(seq, hash_num, k, pos)
  , qual_seq(quality_string.data())
  , phred_min(static_cast<unsigned char>(
      phred_min + static_cast<size_t>(PHRED_OFFSET)))
  , rmq(quality_string, seq.length())
{}
|
||
// Builds a Phred-filtered hasher over a raw sequence buffer, with the quality
// string supplied as a C string. The pointer is stored as-is (not copied).
//
// The numeric threshold is shifted by PHRED_OFFSET once, here, so that roll()
// and roll_back() can compare it directly against raw quality characters.
PhredNtHash::PhredNtHash(const char* seq,
                         size_t seq_len,
                         unsigned hash_num,
                         unsigned k,
                         size_t phred_min,
                         const char* quality_string,
                         size_t pos)
  : NtHash(seq, seq_len, hash_num, k, pos)
  , qual_seq(quality_string)
  , phred_min(static_cast<unsigned char>(
      phred_min + static_cast<size_t>(PHRED_OFFSET)))
  , rmq(quality_string, seq_len)
{}
|
||
// Builds a Phred-filtered hasher over a std::string sequence, with the quality
// string supplied as a C string. The pointer is stored as-is (not copied) and
// the sequence length handed to the range-minimum-query structure is taken
// from `seq`.
//
// The numeric threshold is shifted by PHRED_OFFSET once, here, so that roll()
// and roll_back() can compare it directly against raw quality characters.
PhredNtHash::PhredNtHash(const std::string& seq,
                         unsigned hash_num,
                         unsigned k,
                         size_t phred_min,
                         const char* quality_string,
                         size_t pos)
  : NtHash(seq, hash_num, k, pos)
  , qual_seq(quality_string)
  , phred_min(static_cast<unsigned char>(
      phred_min + static_cast<size_t>(PHRED_OFFSET)))
  , rmq(quality_string, seq.length())
{}
|
||
bool | ||
PhredNtHash::roll() | ||
{ | ||
bool success = NtHash::roll(); | ||
if (!success) { | ||
return false; | ||
} | ||
size_t curr_pos = NtHash::get_pos(); | ||
size_t k = NtHash::get_k(); | ||
size_t seq_len = NtHash::get_seq_len(); | ||
size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); | ||
auto min_phred = (unsigned char)qual_seq[min_phred_idx]; | ||
while (min_phred < phred_min) { | ||
// check next kmer range does not exceed sequence length | ||
if (min_phred_idx + k >= seq_len) { | ||
return false; | ||
} | ||
curr_pos = min_phred_idx + 1; | ||
min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); | ||
min_phred = (size_t)qual_seq[min_phred_idx]; | ||
} | ||
|
||
while (curr_pos != NtHash::get_pos()) { | ||
success = NtHash::roll(); | ||
} | ||
|
||
return success; | ||
} | ||
|
||
bool | ||
PhredNtHash::roll_back() | ||
{ | ||
bool success = NtHash::roll_back(); | ||
if (!success) { | ||
return false; | ||
} | ||
size_t curr_pos = NtHash::get_pos(); | ||
size_t k = NtHash::get_k(); | ||
size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); | ||
auto min_phred = (unsigned char)qual_seq[min_phred_idx]; | ||
while (min_phred < phred_min) { | ||
// check next kmer range does not exceed sequence length | ||
if (min_phred_idx - k >= NtHash::get_seq_len()) { | ||
return false; | ||
} | ||
curr_pos = min_phred_idx - k; | ||
min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1); | ||
min_phred = (size_t)qual_seq[min_phred_idx]; | ||
} | ||
|
||
while (curr_pos != NtHash::get_pos()) { | ||
success = NtHash::roll_back(); | ||
} | ||
|
||
return success; | ||
} | ||
} // namespace btllib |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An alternative would be to use `: public NtHash` and hide unused methods with `using`, e.g. by re-declaring them in a `private:` section of the derived class (`using NtHash::<method>;`). This way, there'll be no need to reimplement `NtHash`'s public methods like `get_pos` etc.

Source: https://www.learncpp.com/cpp-tutorial/hiding-inherited-functionality/