diff --git a/docs/api/examples.rst b/docs/api/examples.rst index f2cc3416..608b5854 100644 --- a/docs/api/examples.rst +++ b/docs/api/examples.rst @@ -8,15 +8,10 @@ Converting a ``.blocks.det`` file into a ``.hap`` file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can use the :ref:`data API ` to easily convert `a PLINK 1.9 .blocks.det file `_ into a ``.hap`` file. -As an example, let's say we would like to convert the following ``.blocks.det`` file. +As an example, let's say we would like to convert `the following .blocks.det file `_. -.. code-block:: - - CHR BP1 BP2 KB NSNPS SNPS - 1 2313888 2331789 17.902 3 rs7527871|rs2840528|rs7545940 - 1 2462779 2482556 19.778 2 rs2296442|rs2246732 - 1 2867411 2869431 2.021 2 rs10752728|rs897635 - 1 2974991 2979823 4.833 3 rs10489588|rs9661525|rs2993510 +.. include:: ../../tests/data/simple.blocks.det + :literal: .. code-block:: python @@ -24,22 +19,24 @@ As an example, let's say we would like to convert the following ``.blocks.det`` # load the genotypes file # you can use either a VCF or PGEN file - gt = data.GenotypesVCF.load("input.vcf.gz") - gt = data.GenotypesPLINK.load("input.pgen") + gt = data.GenotypesVCF.load("tests/data/simple.vcf.gz") + gt = data.GenotypesPLINK.load("tests/data/simple.pgen") # load the haplotypes hp = data.Haplotypes("output.hap") hp.data = {} # iterate through lines of the .blocks.det file - with open("input.blocks.det") as blocks_file: - for idx, line in enumerate(blocks_file.readlines()): + with open("tests/data/simple.blocks.det") as blocks_file: + for idx, line in enumerate(blocks_file.read().splitlines()[1:]): # initialize variables and parse line from the blocks file hap_id = f"H{idx}" chrom, bp1, bp2, kb, nsnps, snps = line.split("\t") # create a haplotype line in the .hap file - hp.data[hap_id] = data.Haplotype(chrom=chrom, start=bp1, end=bp2, id=hap_id) + hp.data[hap_id] = data.Haplotype( + chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id + ) # fetch alleles from the genotypes file snp_gts = gt.subset(variants=tuple(snps.split("|"))) @@ -48,7 +45,12 @@ As an example, let's say we would like to convert the following ``.blocks.det`` # Note that the .blocks.det file doesn't specify an allele, so # we simply choose the first allele (ie the REF allele) for this example hp.data[hap_id].variants = tuple( - data.Variant(start=v["pos"], end=v["pos"]+len(v["alleles"][0]), id=v["id"], allele=v["alleles"][0]) + data.Variant( + start=v["pos"], + end=v["pos"] + len(v["alleles"][0]), + id=v["id"], + allele=v["alleles"][0], + ) for v in snp_gts.variants ) diff --git a/tests/data/simple.blocks.det b/tests/data/simple.blocks.det new file mode 100644 index 00000000..19c4c3e4 --- /dev/null +++ b/tests/data/simple.blocks.det @@ -0,0 +1,4 @@ +CHR BP1 BP2 KB NSNPS SNPS +1 10114 10117 2.001 2 1:10114:T:C|1:10116:A:G +1 10114 10119 2.007 2 1:10114:T:C|1:10117:C:A +1 10116 10119 2.011 2 1:10116:A:G|1:10117:C:A diff --git a/tests/data/simple.blocks.det.hap b/tests/data/simple.blocks.det.hap new file mode 100644 index 00000000..dd2d26f9 --- /dev/null +++ b/tests/data/simple.blocks.det.hap @@ -0,0 +1,10 @@ +# version 0.2.0 +H 1 10114 10117 H0 +H 1 10114 10119 H1 +H 1 10116 10119 H2 +V H0 10114 10115 1:10114:T:C T +V H0 10116 10117 1:10116:A:G A +V H1 10114 10115 1:10114:T:C T +V H1 10117 10118 1:10117:C:A C +V H2 10116 10117 1:10116:A:G A +V H2 10117 10118 1:10117:C:A C diff --git a/tests/test_data.py b/tests/test_data.py index 2fe5626e..31b8481a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -2114,6 +2114,52 @@ def test_breakpoints_to_pop_array_chrom_no_match(self): class TestDocExamples: + def test_blocks2hap(self): + # load the genotypes file + # you can use either a VCF or PGEN file + gt = GenotypesVCF.load(DATADIR / "simple.vcf.gz") + gt = GenotypesPLINK.load(DATADIR / "simple.pgen") + + # load the haplotypes + hp = Haplotypes("output.hap") + hp.data = {} + + # iterate through lines of the .blocks.det file + with open(DATADIR / "simple.blocks.det") as blocks_file: + for idx, line in enumerate(blocks_file.read().splitlines()[1:]): + # initialize variables and parse line from the blocks file + hap_id = f"H{idx}" + chrom, bp1, bp2, kb, nsnps, snps = line.split("\t") + + # create a haplotype line in the .hap file + hp.data[hap_id] = Haplotype( + chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id + ) + + # fetch alleles from the genotypes file + snp_gts = gt.subset(variants=tuple(snps.split("|"))) + + # create variant lines for each haplotype + # Note that the .blocks.det file doesn't specify an allele, so + # we simply choose the first allele (ie the REF allele) for this example + hp.data[hap_id].variants = tuple( + Variant( + start=v["pos"], + end=v["pos"] + len(v["alleles"][0]), + id=v["id"], + allele=v["alleles"][0], + ) + for v in snp_gts.variants + ) + + hp.write() + + # validate the output and clean up afterwards + with open("output.hap") as hp_file: + with open(DATADIR / "simple.blocks.det.hap") as expected: + assert hp_file.read() == expected.read() + hp.fname.unlink() + def test_gts2hap(self): # load variants from the snplist file variants = {}