-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
handle utf-8 encoding #1569
base: master
Are you sure you want to change the base?
handle utf-8 encoding #1569
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
import java.io.IOException; | ||
import java.io.OutputStream; | ||
import java.io.Writer; | ||
import java.nio.charset.*; | ||
|
||
/** | ||
* Fast (I hope) buffered Writer that converts char to byte merely by casting, rather than charset conversion. | ||
|
@@ -70,17 +71,10 @@ public void flush() throws IOException { | |
* All other Writer methods vector through this, so this is the only one that must be overridden. | ||
*/ | ||
@Override | ||
public void write(final char[] chars, int offset, int length) throws IOException { | ||
while (length > 0) { | ||
final int charsToConvert = Math.min(length, buffer.length - numBytes); | ||
StringUtil.charsToBytes(chars, offset, charsToConvert, buffer, numBytes); | ||
numBytes += charsToConvert; | ||
offset += charsToConvert; | ||
length -= charsToConvert; | ||
if (numBytes == buffer.length) { | ||
os.write(buffer, 0, numBytes); | ||
numBytes = 0; | ||
} | ||
} | ||
public void write(final char[] chars, int off, int len) throws IOException { | ||
String str = new String(chars,off,len); | ||
byte[] b = str.getBytes(StandardCharsets.UTF_8); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is going to write all SAM content as UTF-8, but I think the spec only permits UTF-8 in a few places, and requires 7 bit ascii elsewhere. So we may need a different approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like we're actually just doing a costly reimplementation of a standard Java Writer here now. It seems like validation of the character ranges should be done in the SamRecord and not here. Maybe this class has outlived it's usefulness and we can just use a standard Writer in place of it. |
||
int bufferLength = b.length; | ||
os.write(b, 0, bufferLength); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package htsjdk.samtools; | ||
|
||
import htsjdk.*; | ||
import htsjdk.samtools.util.*; | ||
import org.testng.*; | ||
import org.testng.annotations.*; | ||
import java.io.*; | ||
|
||
public class SAMFileRoundTripTest extends HtsjdkTest{ | ||
private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools"); | ||
|
||
@DataProvider(name = "Utf8PositiveTestCases") | ||
public Object[][] Utf8PositiveTestCases() { | ||
SAMProgramRecord programRecordRoundTrip = new SAMProgramRecord("33"); | ||
programRecordRoundTrip.setAttribute("CL","xy"); | ||
programRecordRoundTrip.setAttribute("DS","description"); | ||
|
||
SAMProgramRecord programRecordRoundTripUtf8 = new SAMProgramRecord("33"); | ||
programRecordRoundTripUtf8.setAttribute("CL","äカ"); | ||
programRecordRoundTripUtf8.setAttribute("DS","\uD83D\uDE00リ"); | ||
|
||
SAMSequenceRecord sequenceRecordRoundTrip = new SAMSequenceRecord("chr3",101); | ||
sequenceRecordRoundTrip.setAttribute("DS","descriptionhere"); | ||
sequenceRecordRoundTrip.setSequenceIndex(2); | ||
|
||
SAMSequenceRecord sequenceRecordRoundTripUtf8 = new SAMSequenceRecord("chr3",101); | ||
sequenceRecordRoundTripUtf8.setAttribute("DS","Emoji\uD83D\uDE0A"); | ||
sequenceRecordRoundTripUtf8.setSequenceIndex(2); | ||
|
||
return new Object[][]{ | ||
{"roundtrip.sam", programRecordRoundTrip, "@CO\tcomment here", sequenceRecordRoundTrip}, | ||
{"roundtrip_with_utf8.sam", programRecordRoundTripUtf8, "@CO\tKanjiアメリカ\uD83D\uDE00リä", sequenceRecordRoundTripUtf8} | ||
}; | ||
} | ||
|
||
@Test(dataProvider = "Utf8PositiveTestCases", description = "Test UTF-8 encoding present in permitted fields of a SAM file") | ||
public void Utf8RoundTripPositiveTests(final String inputFile, SAMProgramRecord programRecord,final String commentRecord, SAMSequenceRecord sequenceRecord) throws Exception { | ||
final File input = new File(TEST_DATA_DIR, inputFile); | ||
final File outputFile = File.createTempFile("roundtrip-utf8-out", ".sam"); | ||
outputFile.delete(); | ||
outputFile.deleteOnExit(); | ||
final SAMFileWriterFactory factory = new SAMFileWriterFactory(); | ||
try (SamReader reader = SamReaderFactory.makeDefault().open(input); | ||
SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, new FileOutputStream(outputFile))) { | ||
for (SAMRecord rec : reader) { | ||
writer.addAlignment(rec); | ||
} | ||
SAMFileHeader head = reader.getFileHeader(); | ||
Assert.assertEquals(head.getProgramRecords().get(0), programRecord); | ||
Assert.assertEquals(head.getComments().get(0),commentRecord ); | ||
Assert.assertEquals(head.getSequence("chr3"),sequenceRecord); | ||
} | ||
|
||
final String originalsam; | ||
try (InputStream is = new FileInputStream(input)) { | ||
originalsam = IOUtil.readFully(is); | ||
} | ||
|
||
final String writtenSam; | ||
try (InputStream is = new FileInputStream(outputFile)) { | ||
writtenSam = IOUtil.readFully(is); | ||
} | ||
|
||
Assert.assertEquals(writtenSam, originalsam); | ||
} | ||
|
||
@DataProvider(name = "Utf8NegativeTestCases") | ||
public Object[][] Utf8NegativeTestCases() { | ||
return new Object[][]{ | ||
{"roundtrip_with_utf8_bad_1.sam", "Invalid character in read bases"}, | ||
{"roundtrip_with_utf8_bad_2.sam", "Non-numeric value in POS column"}, | ||
{"roundtrip_with_utf8_bad_2.sam", "Non-numeric value in POS column"} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think these tests may be failing somewhat accidentally, because they're testing multi byte chars in fields that are already constrained to specific sets of characters (numeric for POS, or valid bases for read bases). I'm not certain that its demonstrating that the chars are rejected because they're multi-byte; but rather that its just that they don't conform to the constraints. I suspect they would still fail if you used single byte ASCII characters that didn't conform. It may be find to leave these, but I think we some need additional cases that use multi byte chars in places where there is no constraint other than that they be ASCII. |
||
}; | ||
} | ||
|
||
@Test(dataProvider = "Utf8NegativeTestCases",description = "Test UTF-8 encoding present in unpermitted fields of a SAM file", expectedExceptions = {IllegalArgumentException.class, SAMFormatException.class }) | ||
public void Utf8RoundTripNegativeTest(final String inputFile,final String exceptionString) throws Exception { | ||
final File input = new File(TEST_DATA_DIR, inputFile); | ||
final File outputFile = File.createTempFile("roundtrip-utf8-out", ".sam"); | ||
outputFile.delete(); | ||
outputFile.deleteOnExit(); | ||
final SAMFileWriterFactory factory = new SAMFileWriterFactory(); | ||
try (SamReader reader = SamReaderFactory.makeDefault().open(input); | ||
SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, new FileOutputStream(outputFile))) { | ||
for (SAMRecord rec : reader) { | ||
writer.addAlignment(rec); | ||
} | ||
} | ||
catch (final Exception ex) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The expectedExceptions clause indicates that you expect |
||
Assert.assertTrue(ex.getMessage().contains(exceptionString)); | ||
throw ex; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The way this test is constructed makes it hard to tell if its throwing on read, or on write. Lets replace it with separate tests that show that we fail in both cases. Since we will presumably fail when we read a misplaced multibyte character, I think the test that shows that we fail on write will have to create the output programmatically, rather then via roundtrip. |
||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
@HD VN:1.6 SO:unsorted | ||
@SQ SN:chr1 LN:101 | ||
@SQ SN:chr2 LN:101 | ||
@SQ SN:chr3 LN:101 DS:Emoji😊 | ||
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ | ||
@RG ID:rg1 PL:ILLUMINA SM:sm1 | ||
@PG ID:33 CL:äカ DS:😀リ | ||
@CO Kanjiアメリカ😀リä | ||
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 147 chr2 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
@HD VN:1.6 SO:unsorted | ||
@SQ SN:chr1 LN:101 | ||
@SQ SN:chr2 LN:101 | ||
@SQ SN:chr3 LN:101 DS:Emoji😊 | ||
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ | ||
@RG ID:rg1 PL:ILLUMINA SM:sm1 | ||
@PG ID:33 CL:äカ DS:😀リ | ||
@CO Kanjiアメリカ😀リä | ||
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
A 133 * 0 0 * chr2 1 0 CAリCAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
@HD VN:1.6 SO:unsorted | ||
@SQ SN:chr1 LN:101 | ||
@SQ SN:chr2 LN:101 | ||
@SQ SN:chr3 LN:101 DS:Emoji😊 | ||
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ | ||
@RG ID:rg1 PL:ILLUMINA SM:sm1 | ||
@PG ID:33 CL:äカ DS:😀リ | ||
@CO Kanjiアメリカ😀リä | ||
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
@HD VN:1.6 SO:unsorted | ||
@SQ SN:chr1 LN:101 | ||
@SQ SN:chr2 LN:101 | ||
@SQ SN:chr3 LN:101 DS:Emoji😊 | ||
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ | ||
@RG ID:rg1 PL:ILLUMINA SM:sm1 | ||
@PG ID:33 CL:äカ DS:😀リ | ||
@CO Kanjiアメリカ😀リä | ||
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) R😀:Z:rg1 | ||
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 | ||
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By convention we try to avoid import *, except in the case where not doing so would require a really large list of imports. In this case, it looks like you're only using
StandardCharsets
, so you can just import that.