Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: fix example of .blocks.det to .hap conversion in API docs #236

Merged
merged 6 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions docs/api/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,38 +8,35 @@ Converting a ``.blocks.det`` file into a ``.hap`` file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can use the :ref:`data API <api-data>` to easily convert `a PLINK 1.9 .blocks.det file <https://www.cog-genomics.org/plink/1.9/formats#blocks>`_ into a ``.hap`` file.

As an example, let's say we would like to convert the following ``.blocks.det`` file.
As an example, let's say we would like to convert `the following .blocks.det file <https://github.com/cast-genomics/haptools/blob/main/tests/data/simple.blocks.det>`_.

.. code-block::

CHR BP1 BP2 KB NSNPS SNPS
1 2313888 2331789 17.902 3 rs7527871|rs2840528|rs7545940
1 2462779 2482556 19.778 2 rs2296442|rs2246732
1 2867411 2869431 2.021 2 rs10752728|rs897635
1 2974991 2979823 4.833 3 rs10489588|rs9661525|rs2993510
.. include:: ../../tests/data/simple.blocks.det
:literal:

.. code-block:: python

from haptools import data

# load the genotypes file
# you can use either a VCF or PGEN file
gt = data.GenotypesVCF.load("input.vcf.gz")
gt = data.GenotypesPLINK.load("input.pgen")
gt = data.GenotypesVCF.load("tests/data/simple.vcf.gz")
gt = data.GenotypesPLINK.load("tests/data/simple.pgen")

# load the haplotypes
hp = data.Haplotypes("output.hap")
hp.data = {}

# iterate through lines of the .blocks.det file
with open("input.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.readlines()):
with open("tests/data/simple.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
# initialize variables and parse line from the blocks file
hap_id = f"H{idx}"
chrom, bp1, bp2, kb, nsnps, snps = line.split("\t")

# create a haplotype line in the .hap file
hp.data[hap_id] = data.Haplotype(chrom=chrom, start=bp1, end=bp2, id=hap_id)
hp.data[hap_id] = data.Haplotype(
chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
)

# fetch alleles from the genotypes file
snp_gts = gt.subset(variants=tuple(snps.split("|")))
Expand All @@ -48,7 +45,12 @@ As an example, let's say we would like to convert the following ``.blocks.det``
# Note that the .blocks.det file doesn't specify an allele, so
# we simply choose the first allele (ie the REF allele) for this example
hp.data[hap_id].variants = tuple(
data.Variant(start=v["pos"], end=v["pos"]+len(v["alleles"][0]), id=v["id"], allele=v["alleles"][0])
data.Variant(
start=v["pos"],
end=v["pos"] + len(v["alleles"][0]),
id=v["id"],
allele=v["alleles"][0],
)
for v in snp_gts.variants
)

Expand Down
4 changes: 4 additions & 0 deletions tests/data/simple.blocks.det
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CHR BP1 BP2 KB NSNPS SNPS
1 10114 10117 2.001 2 1:10114:T:C|1:10116:A:G
1 10114 10119 2.007 2 1:10114:T:C|1:10117:C:A
1 10116 10119 2.011 2 1:10116:A:G|1:10117:C:A
10 changes: 10 additions & 0 deletions tests/data/simple.blocks.det.hap
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# version 0.2.0
H 1 10114 10117 H0
H 1 10114 10119 H1
H 1 10116 10119 H2
V H0 10114 10115 1:10114:T:C T
V H0 10116 10117 1:10116:A:G A
V H1 10114 10115 1:10114:T:C T
V H1 10117 10118 1:10117:C:A C
V H2 10116 10117 1:10116:A:G A
V H2 10117 10118 1:10117:C:A C
46 changes: 46 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2061,6 +2061,52 @@ def test_breakpoints_to_pop_array_chrom_no_match(self):


class TestDocExamples:
def test_blocks2hap(self):
# load the genotypes file
# you can use either a VCF or PGEN file
gt = GenotypesVCF.load(DATADIR / "simple.vcf.gz")
gt = GenotypesPLINK.load(DATADIR / "simple.pgen")

# load the haplotypes
hp = Haplotypes("output.hap")
hp.data = {}

# iterate through lines of the .blocks.det file
with open(DATADIR / "simple.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
# initialize variables and parse line from the blocks file
hap_id = f"H{idx}"
chrom, bp1, bp2, kb, nsnps, snps = line.split("\t")

# create a haplotype line in the .hap file
hp.data[hap_id] = Haplotype(
chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
)

# fetch alleles from the genotypes file
snp_gts = gt.subset(variants=tuple(snps.split("|")))

# create variant lines for each haplotype
# Note that the .blocks.det file doesn't specify an allele, so
# we simply choose the first allele (ie the REF allele) for this example
hp.data[hap_id].variants = tuple(
Variant(
start=v["pos"],
end=v["pos"] + len(v["alleles"][0]),
id=v["id"],
allele=v["alleles"][0],
)
for v in snp_gts.variants
)

hp.write()

# validate the output and clean up afterwards
with open("output.hap") as hp_file:
with open(DATADIR / "simple.blocks.det.hap") as expected:
assert hp_file.read() == expected.read()
hp.fname.unlink()

def test_gts2hap(self):
# load variants from the snplist file
variants = {}
Expand Down
Loading