Skip to content

Commit

Permalink
Expose the ability to encode a Genotoype into a GT field (#1648)
Browse files Browse the repository at this point in the history
* Expose the ability to encode a Genotoype into a GT field by exposing
 two public methods in VCFEncoder: writeGtField and encodeGtField
* Supports broadinstitute/gatk#8160 but seems like a useful thing to be able to do in general
* minor breaking change in VCFEncoder, made methods formatVCFField and buildAlleleStrings static
  It is unlikely anyone overrides either of these methods so it should not be a problem.
  • Loading branch information
lbergelson authored Jan 31, 2023
1 parent 1024da5 commit 8108d56
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 8 deletions.
50 changes: 42 additions & 8 deletions src/main/java/htsjdk/variant/vcf/VCFEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ private void fieldIsMissingFromHeaderError(final VariantContext vc, final String
}

@SuppressWarnings("rawtypes")
String formatVCFField(final Object val) {
static String formatVCFField(final Object val) {
final String result;
if (val == null) {
result = VCFConstants.MISSING_VALUE_v4;
Expand Down Expand Up @@ -327,11 +327,7 @@ public void addGenotypeData(final VariantContext vc, final Map<Allele, String> a
throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record");
}

writeAllele(g.getAllele(0), alleleMap, vcfoutput);
for (int i = 1; i < g.getPloidy(); i++) {
vcfoutput.append(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap, vcfoutput);
}
writeGtField(alleleMap, vcfoutput, g);
continue;

} else {
Expand Down Expand Up @@ -387,6 +383,21 @@ public void addGenotypeData(final VariantContext vc, final Map<Allele, String> a
}
}

/**
* write the encoded GT field for a Genotype
* @param alleleMap a mapping of Allele -> GT allele value (from {@link this#buildAlleleStrings(VariantContext)}
* @param vcfoutput the appendable to write to, to avoid inefficiency due to string copying
* @param g the genotoype to encode
* @throws IOException if appending fails with an IOException
*/
public static void writeGtField(final Map<Allele, String> alleleMap, final Appendable vcfoutput, final Genotype g) throws IOException {
writeAllele(g.getAllele(0), alleleMap, vcfoutput);
for (int i = 1; i < g.getPloidy(); i++) {
vcfoutput.append(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap, vcfoutput);
}
}

/*
* Create the info string; assumes that no values are null
*/
Expand Down Expand Up @@ -416,8 +427,31 @@ private void writeInfoString(final Map<String, String> infoFields, final Appenda
}
}

public Map<Allele, String> buildAlleleStrings(final VariantContext vc) {
final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size() + 1);
/**
* Easy way to generate the GT field for a Genotype. This will be less efficient than using
* {@link this#writeGtField(Map, Appendable, Genotype)} because of redundant Map initializations
* @param vc a VariantContext which must contain g or the results are likely to be incorrect
* @param g a Genotype in vc
* @return a String containing the encoding of the GT field of g
*/
public static String encodeGtField(VariantContext vc, Genotype g) {
final StringBuilder builder = new StringBuilder();
try {
writeGtField(VCFEncoder.buildAlleleStrings(vc), builder, g);
} catch (final IOException e) {
throw new RuntimeException("Somehow we failed to append to a StringBuilder, this shouldn't happen.", e);
}
return builder.toString();
}

/**
* return a Map containing Allele -> String(allele position) for all Alleles in VC
* (as well as NO_CALL)
* ex: A,T,TC -> { A:0, T:1, TC:2, NO_CALL:EMPTY_ALLELE}
* This may be efficient when looking up values for many genotypes per VC
*/
public static Map<Allele, String> buildAlleleStrings(final VariantContext vc) {
final Map<Allele, String> alleleMap = new HashMap<>(vc.getAlleles().size() + 1);
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup

final List<Allele> alleles = vc.getAlleles();
Expand Down
36 changes: 36 additions & 0 deletions src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import htsjdk.HtsjdkTest;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -135,6 +137,40 @@ public Object[][] makeMissingFormatTestData() {
return tests.toArray(new Object[][]{});
}

@Test
public void testEncodeGT(){
final VariantContextBuilder vcb = new VariantContextBuilder("test",
"chr?", 100, 100,
Arrays.asList(Allele.REF_A, Allele.ALT_T, Allele.create("TC")));
final Genotype g1 = new GenotypeBuilder("s1", Arrays.asList(Allele.REF_A, Allele.REF_A)).make();
final Genotype g2 = new GenotypeBuilder("s2", Arrays.asList(Allele.ALT_T, Allele.create("TC"))).make();
vcb.genotypes(g1, g2);
final VariantContext vc = vcb.make();

Assert.assertEquals(VCFEncoder.encodeGtField(vc, g1), "0/0");
Assert.assertEquals(VCFEncoder.encodeGtField(vc, g2), "1/2");
}

@Test
public void testsWriteGtField() throws IOException {
final VariantContextBuilder vcb = new VariantContextBuilder("test",
"chr?", 100, 100,
Arrays.asList(Allele.REF_A, Allele.ALT_T, Allele.create("TC")));
final Genotype g1 = new GenotypeBuilder("s1", Arrays.asList(Allele.REF_A, Allele.REF_A)).make();
final Genotype g2 = new GenotypeBuilder("s2", Arrays.asList(Allele.ALT_T, Allele.create("TC"))).make();
vcb.genotypes(g1, g2);
final VariantContext vc = vcb.make();
final Map<Allele, String> alleleStringMap = VCFEncoder.buildAlleleStrings(vc);

final StringBuilder b1 = new StringBuilder();
VCFEncoder.writeGtField(alleleStringMap, b1, g1 );
Assert.assertEquals(b1.toString(), "0/0");

final StringBuilder b2 = new StringBuilder();
VCFEncoder.writeGtField(alleleStringMap, b2, g2);
Assert.assertEquals(b2.toString(), "1/2");
}

@Test(dataProvider = "MissingFormatTestData")
public void testMissingFormatFields(final VCFEncoder encoder, final VariantContext vc, final String expectedLastColumn, final Map<Allele, String> alleleMap, final List<String> genotypeFormatKeys) {
final StringBuilder sb = new StringBuilder();
Expand Down

0 comments on commit 8108d56

Please sign in to comment.