Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added -B/-E to trimfq for keeping first/last INT bp and also -s for shortest read #91

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
50dc245
Update seqtk.c
ndaniel Sep 19, 2014
7be87da
Update seqtk.c
ndaniel Sep 19, 2014
0b15910
Update seqtk.c
ndaniel Sep 19, 2014
fd8400b
Update seqtk.c
ndaniel Sep 20, 2014
63918fb
Update README.md
ndaniel Sep 20, 2014
c99d7fc
Update seqtk.c
ndaniel Sep 20, 2014
77bc4e7
Update seqtk.c
ndaniel Sep 22, 2014
e5d97e7
Update seqtk.c
ndaniel Sep 25, 2014
cbe7863
Update README.md
ndaniel Sep 25, 2014
744b2ae
Update seqtk.c
ndaniel Oct 2, 2014
f97e583
Update seqtk.c
ndaniel Oct 2, 2014
442fa31
Update seqtk.c
ndaniel Oct 2, 2014
a87a7f7
Update seqtk.c
ndaniel Oct 3, 2014
f646fb4
Update README.md
ndaniel Oct 3, 2014
a30bec2
Update seqtk.c
ndaniel Oct 3, 2014
9063c2a
Update seqtk.c
ndaniel Oct 3, 2014
f07d0c9
Update seqtk.c
ndaniel Oct 3, 2014
7a34621
Update seqtk.c
ndaniel Oct 6, 2014
9ce4ca9
updates
ndaniel Oct 22, 2015
60513ba
updates
ndaniel Oct 22, 2015
dd555ec
Update README.md
ndaniel Oct 22, 2015
0b4b78d
updates
ndaniel Mar 5, 2017
4e81d66
Added options -B/-E for trimfq.
ndaniel Mar 5, 2017
4402319
Update seqtk.c
ndaniel Mar 5, 2017
760020c
Update README.md
ndaniel Mar 5, 2017
d95666f
Update seqtk.c
ndaniel Mar 5, 2017
1b69f69
added threshold for shortest read while trimming
ndaniel Mar 5, 2017
89aade8
Update README.md
ndaniel Mar 5, 2017
893a380
Update seqtk.c
ndaniel Mar 5, 2017
73b9c25
back to original
ndaniel Mar 6, 2017
649aa9f
back to original
ndaniel Mar 6, 2017
9464ef3
more precise trimming options added for trimfq, such as -s/-E/-B.
ndaniel Mar 6, 2017
a83376c
added option -e for subseq such that exclusion is performed instead o…
ndaniel Jun 6, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,6 @@ Seqtk Examples

seqtk trimfq -b 5 -e 10 in.fa > out.fa

* Keep the 50bp from right end of each read by trimming the rest and if the trimmed read length ends up having less the 20bp then the first 20 bp should be kept only:

seqtk trimfq -E 50 -s 20 in.fq > out.fq
75 changes: 56 additions & 19 deletions seqtk.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,29 +277,33 @@ krint64_t kr_rand(krand_t *kr)

/* quality based trimming with Mott's algorithm */
int stk_trimfq(int argc, char *argv[])
{ // FIXME: when a record with zero length will always be treated as a fasta record
{
gzFile fp;
kseq_t *seq;
double param = 0.05, q_int2real[128];
int i, c, min_len = 30, left = 0, right = 0, fixed_len = -1;
while ((c = getopt(argc, argv, "l:q:b:e:L:")) >= 0) {
int i, c, min_len = 30, shortest_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0;
while ((c = getopt(argc, argv, "l:s:q:b:e:B:E:")) >= 0) {
switch (c) {
case 'q': param = atof(optarg); break;
case 'l': min_len = atoi(optarg); break;
case 's': shortest_len = atoi(optarg); break;
case 'b': left = atoi(optarg); break;
case 'e': right = atoi(optarg); break;
case 'L': fixed_len = atoi(optarg); break;
case 'B': left_keep = atoi(optarg); break;
case 'E': right_keep = atoi(optarg); break;
}
}
if (optind == argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: seqtk trimfq [options] <in.fq>\n\n");
fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param);
fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len);
fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n");
fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n");
fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n");
fprintf(stderr, " -Q force FASTQ output\n");
fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e/-B/-E) [%.2f]\n", param);
fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len);
fprintf(stderr, " -s INT trimming by -b/-e/-B/-E shall not produce reads shorter then INT bp [%d]\n", shortest_len);
fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q) [0]\n");
fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q) [0]\n");
fprintf(stderr, " -B INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep);
fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep);
// fprintf(stderr, " -Q force FASTQ output\n");
fprintf(stderr, "\n");
return 1;
}
Expand All @@ -314,11 +318,34 @@ int stk_trimfq(int argc, char *argv[])
while (kseq_read(seq) >= 0) {
int beg, tmp, end;
double s, max;
if (left || right || fixed_len > 0) {
if (left_keep) {
beg = left; end = left + left_keep;
if (seq->seq.l < end) end = seq->seq.l;
if (seq->seq.l < beg) beg = seq->seq.l;
if (end - beg < shortest_len) {
beg = 0;
end = shortest_len;
if (end > seq->seq.l) end = seq->seq.l;
}
} else if (right_keep) {
beg = seq->seq.l - right_keep - right; end = seq->seq.l - right;
if (beg < 0) beg = 0;
if (end < 0) end = 0;
if (end - beg < shortest_len) {
beg = 0;
end = shortest_len;
if (end > seq->seq.l) end = seq->seq.l;
}
} else if (left || right) {
beg = left; end = seq->seq.l - right;
if (beg >= end) beg = end = 0;
if (fixed_len > 0 && end - beg > fixed_len) end = beg + fixed_len;
} else if (seq->qual.l > min_len) {
if (end < 0) end = 0;
if (seq->seq.l < beg) beg = seq->seq.l;
if (end - beg < shortest_len) {
beg = 0;
end = shortest_len;
if (end > seq->seq.l) end = seq->seq.l;
}
} else if (seq->qual.l > min_len && param != 0.) {
for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) {
int q = seq->qual.s[i];
if (q < 36) q = 36;
Expand Down Expand Up @@ -548,18 +575,21 @@ int stk_subseq(int argc, char *argv[])
khash_t(reg) *h = kh_init(reg);
gzFile fp;
kseq_t *seq;
int l, i, j, c, is_tab = 0, line = 0;
int l, i, j, c, is_tab = 0, line = 0, is_exclude = 0;
reglist_t dummy;
khint_t k;
while ((c = getopt(argc, argv, "tl:")) >= 0) {
while ((c = getopt(argc, argv, "tel:")) >= 0) {
switch (c) {
case 't': is_tab = 1; break;
case 'e': is_exclude = 1; break;
case 'l': line = atoi(optarg); break;
}
}
if (optind + 2 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n");
fprintf(stderr, "Options: -t TAB delimited output\n");
fprintf(stderr, " -e exclusion instead of inclusion for sequences from <name.list>\n");
fprintf(stderr, " -l INT sequence line length [%d]\n\n", line);
fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n");
return 1;
Expand All @@ -575,12 +605,19 @@ int stk_subseq(int argc, char *argv[])
fprintf(stderr, "[E::%s] failed to open the input file/stream\n", __func__);
return 1;
}
dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
seq = kseq_init(fp);
while ((l = kseq_read(seq)) >= 0) {
reglist_t *p;
k = kh_get(reg, h, seq->name.s);
if (k == kh_end(h)) continue;
p = &kh_val(h, k);
if (is_exclude == 0) {
if (k == kh_end(h)) continue;
p = &kh_val(h, k);
} else {
if (k != kh_end(h)) continue;
p = &dummy;
dummy.a[0] = INT_MAX;
}
for (i = 0; i < p->n; ++i) {
int beg = p->a[i]>>32, end = p->a[i];
if (beg >= seq->seq.l) {
Expand Down Expand Up @@ -1664,7 +1701,7 @@ static int usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Usage: seqtk <command> <arguments>\n");
fprintf(stderr, "Version: 1.2-r101-dirty\n\n");
fprintf(stderr, "Version: 1.2-r101c-dirty\n\n");
fprintf(stderr, "Command: seq common transformation of FASTA/Q\n");
fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n");
fprintf(stderr, " sample subsample sequences\n");
Expand Down