-
Notifications
You must be signed in to change notification settings - Fork 0
/
splitreads.cpp
81 lines (66 loc) · 1.59 KB
/
splitreads.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#include <iostream>
#include <vector>
#include <unordered_map>
#include <fstream>
#include <string>
#include <zlib.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
using namespace std;
int main(int ac, char **av)
{
if (ac != 4)
{
cout << "Usage stats <FASTQ/FASTA File> <CHUNK_SIZE> <OUT_DIR>" << endl;
return -1;
}
string path = av[1];
float chunksize = stof(av[2]);
string outdir = av[3];
struct stat info;
if (info.st_mode & S_IFDIR)
{
cout << "Out directory already exist. Please remove " << outdir << endl;
return -1;
}
else
{
mkdir(outdir.c_str(), 0755);
}
gzFile fp = gzopen(path.c_str(), "r");
kseq_t *ks = kseq_init(fp);
int ret;
u_int64_t no_reads = 0;
while ((ret = kseq_read(ks)) >= 0)
{
no_reads++;
}
cout << "Number of sequences " << no_reads << endl;
gzrewind(fp);
kseq_rewind(ks);
float chunks = ceil((float)no_reads / chunksize);
cout << "Breaking into " << chunks << " chunks" << endl;
no_reads = 0;
ofstream fso;
int chunkid = 1;
while ((ret = kseq_read(ks)) >= 0)
{
no_reads++;
if (no_reads >1 && no_reads % (int)chunksize == 1)
{
chunkid++;
fso.close();
}
if (!fso.is_open())
{
fso.open(outdir + "/" + to_string(chunkid) + ".chunk.fasta");
}
fso << ">" << ks->name.s << "\n" << ks->seq.s << "\n";
}
fso.close();
gzclose(fp);
kseq_destroy(ks);
}