-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlikelihood.C
143 lines (125 loc) · 3.46 KB
/
likelihood.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/****************************************************************
likelihood.C
Copyright (C)2014 William H. Majoros ([email protected]).
This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
License (GPL) version 3, as described at www.opensource.org.
****************************************************************/
#include <iostream>
#include "BOOM/String.H"
#include "BOOM/CommandLine.H"
#include "BOOM/FastaReader.H"
#include "BOOM/Alphabet.H"
#include "BOOM/Exceptions.H"
#include "BOOM/Sequence.H"
#include "BinaryProductions.H"
#include "SCFG.H"
#include "Inside.H"
using namespace std;
using namespace BOOM;
class Application {
Alphabet alphabet, bgAlphabet;
BinaryProductions binaryProductions, bgProds;
void getAlphabet(SCFG &,Alphabet &);
float applyWindows(Sequence &,BinaryProductions &,int windowSize);
public:
Application();
int main(int argc,char *argv[]);
};
int main(int argc,char *argv[])
{
try
{
Application app;
return app.main(argc,argv);
}
catch(const RootException &e) {
cerr<<e.getMessage()<<endl;
}
catch(const char *p)
{
cerr << p << endl;
}
catch(const string &msg)
{
cerr << msg.c_str() << endl;
}
catch(const exception &e)
{
cerr << "STL exception caught in main:\n" << e.what() << endl;
}
catch(...)
{
cerr << "Unknown exception caught in main" << endl;
}
return -1;
}
Application::Application()
{
// ctor
}
int Application::main(int argc,char *argv[])
{
// Process command line
CommandLine cmd(argc,argv,"b:w:");
if(cmd.numArgs()!=2)
throw String("\n\
likelihood [options] <grammar.cnf> <seq.fasta>\n\
-b filename : compute likelihood ratios using bg model from file\n\
-w windowsize : use nonoverlapping windows\n\
");
const String grammarFile=cmd.arg(0);
const String fastaFile=cmd.arg(1);
const bool wantRatios=cmd.option('b');
const bool wantWindows=cmd.option('w');
int windowSize=wantWindows ? cmd.optParm('w').asInt() : 0;
// Load and recode grammar using binary productions for fast indexing
SCFG G(grammarFile);
getAlphabet(G,alphabet);
binaryProductions.init(G,alphabet);
if(wantRatios) {
SCFG bg(cmd.optParm('b'));
getAlphabet(bg,bgAlphabet);
bgProds.init(bg,bgAlphabet);
}
// Process sequence file
FastaReader reader(fastaFile,alphabet);
String def, seqStr;
while(reader.nextSequence(def,seqStr)) {
const int L=seqStr.length();
if(L==0) throw "empty sequence encountered";
Sequence seq(seqStr,alphabet);
float lik=applyWindows(seq,binaryProductions,windowSize);
if(wantRatios) lik-=applyWindows(seq,bgProds,windowSize);
cout<<lik<<endl;
}
return 0;
}
float Application::applyWindows(Sequence &seq,BinaryProductions &prods,
int windowSize)
{
float lik=0;
Sequence subseq;
int L=seq.getLength();
if(windowSize<1) windowSize=L;
for(int begin=0 ; begin<L ; begin+=windowSize) {
int end=begin+windowSize;
if(end>=L) end=L;
int len=end-begin;
seq.getSubsequence(begin,len,subseq);
Inside inside(subseq,prods);
lik+=inside.likelihood();
}
return lik;
}
void Application::getAlphabet(SCFG &G,Alphabet &alphabet)
{
GrammarAlphabet &terminals=G.getTerminals();
const int N=terminals.size();
for(int i=0 ; i<N ; ++i) {
GrammarSymbol *s=terminals[i];
const String &lexeme=s->getLexeme();
if(lexeme.length()!=1)
throw lexeme+" has improper length for a terminal";
alphabet.add(lexeme[0]);
}
}