Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PhredNtHash class #108

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/btllib/nthash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ class NtHash
bool forward() const { return forward_hash <= reverse_hash; }
unsigned get_hash_num() const { return hash_num; }
unsigned get_k() const { return k; }
size_t get_seq_len() const { return seq_len; }

uint64_t get_forward_hash() const { return forward_hash; }
uint64_t get_reverse_hash() const { return reverse_hash; }
Expand Down
127 changes: 127 additions & 0 deletions include/btllib/phred_nthash.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#ifndef BTLLIB_PHRED_NTHASH_HPP
#define BTLLIB_PHRED_NTHASH_HPP

#include "btllib/nthash.hpp"
#include "btllib/util.hpp"

#include <string>
#include <vector>

namespace btllib {
/**
* NtHash with Phred score filtering.
*/
class PhredNtHash : private NtHash
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An alternative would be to use : public NtHash and hide unused methods with using, e.g.

private:
  using NtHash::peek;

This way, there'll be no need to reimplement NtHash's public methods like get_pos etc.

Source: https://www.learncpp.com/cpp-tutorial/hiding-inherited-functionality/

{
public:
/**
* Constructor for PhredNtHash.
* @param seq Sequence to hash.
* @param seq_len Length of `seq`.
* @param hash_num Number of hashes to compute.
* @param k Length of k-mer.
* @param phred_min Minimum Phred score for a base to be included in the hash.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the explanation for phred_min needs clarification. I thought that if the Phred score for a base is less than this threshold, the k-mer will be ignored, not that specific base (unlike spaced seeds where bases are ignored).

* @param quality_string String of Phred scores for each base in `seq`.
* @param pos Position to start hashing from.
* @return PhredNtHash object.
*/
PhredNtHash(const char* seq,
size_t seq_len,
unsigned hash_num,
unsigned k,
size_t phred_min,
std::string_view quality_string,
size_t pos = 0);

/**
* Constructor for PhredNtHash.
* @param seq Sequence to hash.
* @param hash_num Number of hashes to compute.
* @param k Length of k-mer.
* @param phred_min Minimum Phred score for a base to be included in the hash.
* @param quality_string String of Phred scores for each base in `seq`.
* @param pos Position to start hashing from.
* @return PhredNtHash object.
*/
PhredNtHash(const std::string& seq,
unsigned hash_num,
unsigned k,
size_t phred_min,
std::string_view quality_string,
size_t pos = 0);

/**
* Constructor for PhredNtHash.
* @param seq Sequence to hash.
* @param seq_len Length of `seq`.
* @param hash_num Number of hashes to compute.
* @param k Length of k-mer.
* @param phred_min Minimum Phred score for a base to be included in the hash.
* @param quality_string String of Phred scores for each base in `seq`.
* @param pos Position to start hashing from.
* @return PhredNtHash object.
*/
PhredNtHash(const char* seq,
size_t seq_len,
unsigned hash_num,
unsigned k,
size_t phred_min,
const char* quality_string,
size_t pos = 0);

/**
* Constructor for PhredNtHash.
* @param seq Sequence to hash.
* @param hash_num Number of hashes to compute.
* @param k Length of k-mer.
* @param phred_min Minimum Phred score for a base to be included in the hash.
* @param quality_string String of Phred scores for each base in `seq`.
* @param pos Position to start hashing from.
* @return PhredNtHash object.
*/
PhredNtHash(const std::string& seq,
unsigned hash_num,
unsigned k,
size_t phred_min,
const char* quality_string,
size_t pos = 0);

/**
* Roll the hash forward by one base. If the next k-mer contains a base with
* a Phred score below `phred_min`, the hash will be rolled forward until a
* k-mer with all bases with a Phred score above `phred_min` is found.
* @return True if successful, false if the end of the sequence is reached or
* no k-mer with all bases with a Phred score above `phred_min` is found.
*/
bool roll();
/**
* Roll the hash backward by one base. If the previous k-mer contains a base
* with a Phred score below `phred_min`, the hash will be rolled backward
* until a k-mer with all bases with a Phred score above `phred_min` is found.
* @return True if successful, false if the start of the sequence is reached
* or no k-mer with all bases with a Phred score above `phred_min` is found.
*/
bool roll_back();
const uint64_t* hashes() const { return NtHash::hashes(); }

/**
* Get the position of last hashed k-mer or the k-mer to be hashed if roll()
* has never been called on this NtHash object.
*/
size_t get_pos() const { return NtHash::get_pos(); }
bool forward() const { return NtHash::forward(); }
unsigned get_hash_num() const { return NtHash::get_hash_num(); }
unsigned get_k() const { return NtHash::get_k(); }
size_t get_seq_len() const { return NtHash::get_seq_len(); }
uint64_t get_forward_hash() const { return NtHash::get_forward_hash(); }
uint64_t get_reverse_hash() const { return NtHash::get_reverse_hash(); }

private:
const char* qual_seq;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's prefer to use a shared_ptr or an std::string_view for qual_seq here.

unsigned char phred_min;
RangeMinimumQuery<std::string_view> rmq;
};

} // namespace btllib

#endif // BTLLIB_PHRED_NTHASH_HPP
91 changes: 91 additions & 0 deletions include/btllib/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,18 @@
#define BTLLIB_UTIL_HPP

#include "btllib/cstring.hpp"
#include "btllib/status.hpp"

#include <cmath>
#include <condition_variable>
#include <mutex>
#include <string>
#include <vector>

namespace btllib {

static constexpr double PHRED_OFFSET = 33.0;

/**
* Split a string into component substrings with a delimiter.
*
Expand Down Expand Up @@ -125,6 +129,93 @@ get_dirname(const std::string& path);
double
calc_phred_avg(const std::string& qual, size_t start_pos = 0, size_t len = 0);

/**
* Range minimum query class.
* Finds the index of the minimum value in a range of any iterable that can be
* indexed via [x] where x is the index corresponding to the element in the
* iterable.
* @tparam T The type of the iterable.
*/
template<class T>
class RangeMinimumQuery
{
public:
/**
* Constructor for RangeMinimumQuery.
* @param array The iterable to be queried.
* @param size_of_array The size of the iterable.
*/
RangeMinimumQuery(const T& array, size_t size_of_array);
/**
* Query the index of the minimum value in the range [start, end].
* @param start The start index of the range.
* @param end The end index of the range.
* @return The index of the minimum value in the range.
* @pre start <= end
* @pre end < size_of_array
* @pre start >= 0
*
* @post The return value is in the range [start, end]
* @post The return value is the index of the minimum value in the range.
*/
size_t query(size_t start, size_t end);

private:
//* The lookup table for the query.
std::vector<std::vector<size_t>> lookup_table;
};

// adapted from
// https://www.geeksforgeeks.org/range-minimum-query-for-static-array/
template<class T>
RangeMinimumQuery<T>::RangeMinimumQuery(const T& array, size_t size_of_array)
{
size_t log_size =
(size_t)(log2(
size_of_array)) + // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
1;
lookup_table.resize(log_size, std::vector<size_t>(size_of_array));
for (size_t i = 0; i < size_of_array; i++) {
lookup_table[0][i] = i;
}
for (size_t j = 1; j < log_size; j++) {
for (size_t i = 0; i + (1 << j) - 1 < size_of_array; i++) {
if (array[lookup_table[j - 1][i]] <
array[lookup_table[j - 1][i + (1 << (j - 1))]]) {
lookup_table[j][i] = lookup_table[j - 1][i];
} else {
lookup_table[j][i] = lookup_table[j - 1][i + (1 << (j - 1))];
}
}
}
}

template<class T>
size_t
RangeMinimumQuery<T>::query(size_t start, size_t end)
{
if (start > end) {
log_error("RangeMinimumQuery::query: start > end");
std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe)
}
if (end >= lookup_table[0].size()) {
log_error("RangeMinimumQuery::query: end >= lookup_table[0].size()");
std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe)
}
if (start >= lookup_table[0].size()) {
log_error("RangeMinimumQuery::query: start >= lookup_table[0].size()");
std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe)
}

auto j = (size_t)(log2(
end - // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
start + 1));
if (lookup_table[j][start] <= lookup_table[j][end - (1 << j) + 1]) {
return lookup_table[j][start];
}
return lookup_table[j][end - (1 << j) + 1];
}

// This exists in C++20, but we don't support that yet
/// @cond HIDDEN_SYMBOLS
class Barrier
Expand Down
114 changes: 114 additions & 0 deletions src/btllib/phred_nthash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#include "btllib/phred_nthash.hpp"

namespace btllib {
PhredNtHash::PhredNtHash(const char* seq,
size_t seq_len,
unsigned hash_num,
unsigned k,
size_t phred_min,
std::string_view quality_string,
size_t pos)
: NtHash(seq, seq_len, hash_num, k, pos)
, qual_seq(quality_string.data())
, phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET))
, rmq(quality_string, seq_len)
{
}

PhredNtHash::PhredNtHash(const std::string& seq,
unsigned hash_num,
unsigned k,
size_t phred_min,
std::string_view quality_string,
size_t pos)
: NtHash(seq, hash_num, k, pos)
, qual_seq(quality_string.data())
, phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET))
, rmq(quality_string, seq.length())
{
}

PhredNtHash::PhredNtHash(const char* seq,
size_t seq_len,
unsigned hash_num,
unsigned k,
size_t phred_min,
const char* quality_string,
size_t pos)
: NtHash(seq, seq_len, hash_num, k, pos)
, qual_seq(quality_string)
, phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET))
, rmq(quality_string, seq_len)
{
}

PhredNtHash::PhredNtHash(const std::string& seq,
unsigned hash_num,
unsigned k,
size_t phred_min,
const char* quality_string,
size_t pos)
: NtHash(seq, hash_num, k, pos)
, qual_seq(quality_string)
, phred_min((unsigned char)(phred_min + (size_t)PHRED_OFFSET))
, rmq(quality_string, seq.length())
{
}

bool
PhredNtHash::roll()
{
bool success = NtHash::roll();
if (!success) {
return false;
}
size_t curr_pos = NtHash::get_pos();
size_t k = NtHash::get_k();
size_t seq_len = NtHash::get_seq_len();
size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1);
auto min_phred = (unsigned char)qual_seq[min_phred_idx];
while (min_phred < phred_min) {
// check next kmer range does not exceed sequence length
if (min_phred_idx + k >= seq_len) {
return false;
}
curr_pos = min_phred_idx + 1;
min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1);
min_phred = (size_t)qual_seq[min_phred_idx];
}

while (curr_pos != NtHash::get_pos()) {
success = NtHash::roll();
}

return success;
}

bool
PhredNtHash::roll_back()
{
bool success = NtHash::roll_back();
if (!success) {
return false;
}
size_t curr_pos = NtHash::get_pos();
size_t k = NtHash::get_k();
size_t min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1);
auto min_phred = (unsigned char)qual_seq[min_phred_idx];
while (min_phred < phred_min) {
// check next kmer range does not exceed sequence length
if (min_phred_idx - k >= NtHash::get_seq_len()) {
return false;
}
curr_pos = min_phred_idx - k;
min_phred_idx = rmq.query(curr_pos, curr_pos + k - 1);
min_phred = (size_t)qual_seq[min_phred_idx];
}

while (curr_pos != NtHash::get_pos()) {
success = NtHash::roll_back();
}

return success;
}
} // namespace btllib
1 change: 0 additions & 1 deletion src/btllib/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ calc_phred_avg(const std::string& qual, const size_t start_pos, size_t len)
phred_sum += (size_t)qual.at(i);
}

static constexpr double PHRED_OFFSET = 33.0;
return ((double)phred_sum / (double)len) - PHRED_OFFSET;
}

Expand Down
Loading