Skip to content

Commit

Permalink
remove aaf field from Genotypes objects
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm committed Oct 18, 2022
1 parent f7df71f commit 94e1f61
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 68 deletions.
4 changes: 2 additions & 2 deletions docs/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Properties
**********
The ``data`` property of a :class:`Genotypes` object is a numpy array representing the genotype matrix. Rows of the array are samples and columns are variants. Each entry in the matrix is a tuple of values -- one for each chromosome. Each value is an integer denoting the index of the allele (0 for REF, 1 for the first ALT allele, 2 for the next ALT allele, etc).

There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has four named columns: "id", "chrom", "pos", and "aaf" (alternate allele frequency).
There are two additional properties that contain variant and sample metadata. The ``variants`` property is a numpy structured array and the ``samples`` property is a simple tuple of sample IDs. The ``variants`` structured array has three named columns: "id" (variant ID), "chrom" (chromosome name), and "pos" (chromosomal position).

Reading a file
**************
Expand Down Expand Up @@ -143,7 +143,7 @@ By default, the ``subset()`` method returns a new :class:`Genotypes` instance. T

GenotypesRefAlt
+++++++++++++++
The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "aaf", "ref", and "alt".
The :class:`Genotypes` class can be easily *extended* (sub-classed) to load extra fields into the ``variants`` structured array. The :class:`GenotypesRefAlt` class is an example of this where I extended the :class:`Genotypes` class to add REF and ALT fields from the VCF to the columns of the structured array. So the ``variants`` array will have named columns: "id", "chrom", "pos", "ref", and "alt".

All of the other methods in the :class:`Genotypes` class are inherited, but the :class:`GenotypesRefAlt` class implements an additional method ``write()`` for dumping the contents of the class to the provided file.

Expand Down
44 changes: 3 additions & 41 deletions haptools/data/genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ class Genotypes(Data):
1. ID
2. CHROM
3. POS
4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
log: Logger
A logging instance for recording debug statements
_prephased : bool
Expand Down Expand Up @@ -61,7 +60,6 @@ def __init__(self, fname: Path | str, log: Logger = None):
("id", "U50"),
("chrom", "U10"),
("pos", np.uint32),
("aaf", np.float64),
],
)
self._prephased = False
Expand Down Expand Up @@ -219,7 +217,7 @@ def _variant_arr(self, record: Variant):
A row from the :py:attr:`~.Genotypes.variants` array
"""
return np.array(
(record.ID, record.CHROM, record.POS, record.aaf),
(record.ID, record.CHROM, record.POS),
dtype=self.variants.dtype,
)

Expand Down Expand Up @@ -504,38 +502,6 @@ def check_phase(self):
# remove the last dimension that contains the phase info
self.data = self.data[:, :, :2]

def to_MAC(self):
"""
Convert the ALT count GT matrix into a matrix of minor allele counts
This function modifies :py:attr:`~.Genotypes.data` in-place
It also changes the 'aaf' record in :py:attr:`~.Genotypes.variants` to 'maf'
Raises
------
AssertionError
If the matrix has already been converted
"""
if self.variants.dtype.names[3] == "maf":
self.log.warning(
"The matrix already counts instances of the minor allele rather than "
"the ALT allele."
)
return
need_conversion = self.variants["aaf"] > 0.5
# flip the count on the variants that have an alternate allele frequency
# above 0.5
self.data[:, need_conversion, :2] = ~self.data[:, need_conversion, :2]
# also encode an MAF instead of an AAF in self.variants
self.variants["aaf"][need_conversion] = (
1 - self.variants["aaf"][need_conversion]
)
# replace 'aaf' with 'maf' in the matrix
self.variants.dtype.names = [
(x, "maf")[x == "aaf"] for x in self.variants.dtype.names
]


class GenotypesRefAlt(Genotypes):
"""
Expand All @@ -556,9 +522,8 @@ class GenotypesRefAlt(Genotypes):
1. ID
2. CHROM
3. POS
4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called)
5. REF
6. ALT
4. REF
5. ALT
log: Logger
See documentation for :py:attr:`~.Genotypes.log`
"""
Expand All @@ -571,7 +536,6 @@ def __init__(self, fname: Path | str, log: Logger = None):
("id", "U50"),
("chrom", "U10"),
("pos", np.uint32),
("aaf", np.float64),
("ref", "U100"),
("alt", "U100"),
],
Expand All @@ -586,7 +550,6 @@ def _variant_arr(self, record: Variant):
record.ID,
record.CHROM,
record.POS,
record.aaf,
record.REF,
record.ALT[0],
),
Expand Down Expand Up @@ -795,7 +758,6 @@ def _variant_arr(
record[cid["ID"]],
record[cid["CHROM"]],
record[cid["POS"]],
0.5,
record[cid["REF"]],
record[cid["ALT"]],
),
Expand Down
2 changes: 1 addition & 1 deletion haptools/data/haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,7 +1080,7 @@ def transform(
hap_gts = GenotypesRefAlt(fname=None, log=self.log)
hap_gts.samples = gts.samples
hap_gts.variants = np.array(
[(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
[(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()],
dtype=hap_gts.variants.dtype,
)
# build a fast data structure for querying the alleles in each haplotype:
Expand Down
2 changes: 1 addition & 1 deletion haptools/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def transform(
hap_gts = data.GenotypesRefAlt(fname=None, log=self.log)
hap_gts.samples = gts.samples
hap_gts.variants = np.array(
[(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
[(hap.id, hap.chrom, hap.start, "A", "T") for hap in self.data.values()],
dtype=hap_gts.variants.dtype,
)
# build a fast data structure for querying the alleles in each haplotype:
Expand Down
28 changes: 5 additions & 23 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,12 @@ def _get_fake_genotypes(self):
gts.data = self._get_expected_genotypes()
gts.variants = np.array(
[
("1:10114:T:C", "1", 10114, 0),
("1:10116:A:G", "1", 10116, 0.6),
("1:10117:C:A", "1", 10117, 0),
("1:10122:A:G", "1", 10122, 0),
],
dtype=[
("id", "U50"),
("chrom", "U10"),
("pos", np.uint32),
("aaf", np.float64),
("1:10114:T:C", "1", 10114),
("1:10116:A:G", "1", 10116),
("1:10117:C:A", "1", 10117),
("1:10122:A:G", "1", 10122),
],
dtype=gts.variants.dtype,
)
gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
gts.check_phase()
Expand Down Expand Up @@ -117,18 +112,6 @@ def test_load_genotypes(self, caplog):
gts.check_phase()
assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"

# convert the matrix of alt allele counts to a matrix of minor allele counts
assert gts.variants["aaf"][1] == 0.6
gts.to_MAC()
expected[:, 1, :] = ~expected[:, 1, :]
np.testing.assert_allclose(gts.data, expected)
assert gts.variants["maf"][1] == 0.4

# try to do the MAC conversion again - it should warn b/c we've already done it
caplog.clear()
gts.to_MAC()
assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"

def test_load_genotypes_example(self):
samples = (
"HG00096",
Expand Down Expand Up @@ -632,7 +615,6 @@ def test_read_subset(self):
haps.read(region="21:26928472-26941960")
assert expected == haps.data


def test_read_extras(self):
# what do we expect to see from the simphenotype.hap file?
expected = {
Expand Down

0 comments on commit 94e1f61

Please sign in to comment.