Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix mac issues #159

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/hhalign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ void HHalign::help(Parameters& par, char all) {
printf(" -norealign do NOT realign displayed hits with MAC algorithm (def=realign) \n");
printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n");
printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact);
printf(" -mac_min_length minimum length of MAC hit in comparison with the Viterbi hit\n");
printf(" (in terms of matched_cols, default=%.2f)\n", par.mac_min_length);
printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n");

if (all) {
Expand Down Expand Up @@ -518,6 +520,9 @@ void HHalign::ProcessArguments(Parameters& par) {
else if (!strcmp(argv[i], "-mact") && (i < argc - 1)) {
par.mact = atof(argv[++i]);
}
else if (!strcmp(argv[i], "-mac_min_length")
&& (i < argc - 1))
par.mac_min_length = atof(argv[++i]);
//not in the help - but intended
else if (!strcmp(argv[i], "-scwin") && (i < argc - 1)) {
par.columnscore = 5;
Expand Down
191 changes: 142 additions & 49 deletions src/hhbacktracemac.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,70 +108,162 @@ void PosteriorDecoder::writeProfilesToHits(HMM &q, HMM &t, PosteriorMatrix &p_mm
}
}

void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr) {

// Trace back trough the matrix b[i][j] until STOP state is found

LogLevel actual_level = Log::reporting_level();
int step; // counts steps in path through 5-layered dynamic programming matrix
int i,j; // query and template match state indices

initializeBacktrace(t,hit);

// Make sure that backtracing stops when t:M1 or q:M1 is reached (Start state), e.g. sMM[i][1], or sIM[i][1] (M:MM, B:IM)
for (i = 0; i <= q.L; ++i) backtrace_matrix.setMatMat(i, 1, elem, ViterbiMatrix::STOP); // b[i][1] = STOP;
for (j = 1; j <= t.L; ++j) backtrace_matrix.setMatMat(1, j, elem, ViterbiMatrix::STOP); // b[1][j] = STOP;

void PosteriorDecoder::findBacktraceMAC(ViterbiMatrix & backtrace_matrix, const int elem,
int & i1, int & j1, int i2, int j2, std::vector<int> & vi, std::vector<int> & vj, std::vector<char> & states,
int & matched_cols, int & nsteps,
LogLevel & actual_level) {
// Back-tracing loop
// In contrast to the Viterbi-Backtracing, STOP signifies the first Match-Match state, NOT the state before the first MM state
hit.matched_cols = 1; // for each MACTH (or STOP) state matched_col is incremented by 1
hit.state = ViterbiMatrix::MM; // lowest state with maximum score must be match-match state
step = 0; // steps through the matrix correspond to alignment columns (from 1 to nsteps)
i = hit.i2; j = hit.j2; // last aligned pair is (i2,j2)
matched_cols = 1; // for each MACTH (or STOP) state matched_col is incremented by 1
char state = ViterbiMatrix::MM; // lowest state with maximum score must be match-match state
int step = 0; // steps through the matrix correspond to alignment columns (from 1 to nsteps)
int i = i2; // last aligned pair is (i2,j2)
int j = j2;

// arrays start from 1
vi.push_back(0);
vj.push_back(0);
states.push_back((char) ViterbiMatrix::STOP); // note: static cast is needed for avoiding "undefined" ViterbiMatrix::MM object error

if (backtrace_matrix.getMatMat(i, j, elem) != ViterbiMatrix::MM) { // b[i][j] != MM
if (Log::reporting_level() > DEBUG)
fprintf(stderr,"Error: backtrace does not start in match-match state, but in state %i, (i,j)=(%i,%i)\n",backtrace_matrix.getMatMat(i, j, elem),i,j);

step = 0;
hit.i[step] = i;
hit.j[step] = j;
hit.alt_i->push_back(i);
hit.alt_j->push_back(j);
hit.state = ViterbiMatrix::STOP;
vi[0] = i;
vj[0] = j;
state = ViterbiMatrix::STOP;
} else {
while (hit.state != ViterbiMatrix::STOP) {
while (state != ViterbiMatrix::STOP) {
step++;
hit.states[step] = hit.state = backtrace_matrix.getMatMat(i, j, elem); // b[i][j];
hit.i[step] = i;
hit.j[step] = j;
hit.alt_i->push_back(i);
hit.alt_j->push_back(j);
// Exclude cells in direct neighbourhood from all further alignments
for (int ii = imax(i-2,1); ii <= imin(i+2, q.L); ++ii)
// hit.cell_off[ii][j] = 1;
backtrace_matrix.setCellOff(ii, j, elem, true);
for (int jj = imax(j-2,1); jj <= imin(j+2, t.L); ++jj)
backtrace_matrix.setCellOff(i, jj, elem, true);

if (hit.state == ViterbiMatrix::MM) hit.matched_cols++;

switch (hit.state) {
state = backtrace_matrix.getMatMat(i, j, elem); // b[i][j];
states.push_back(state);
vi.push_back(i);
vj.push_back(j);

if (state == ViterbiMatrix::MM) matched_cols++;

switch (state) {
case ViterbiMatrix::MM: i--; j--; break;
case ViterbiMatrix::IM: j--; break;
case ViterbiMatrix::MI: i--; break;
case ViterbiMatrix::STOP: break;
default:
fprintf(stderr,"Error: unallowed state value %i occurred during backtracing at step %i, (i,j)=(%i,%i)\n", hit.state, step, i, j);
hit.state = 0;
fprintf(stderr,"Error: unallowed state value %i occurred during backtracing at step %i, (i,j)=(%i,%i)\n", state, step, i, j);
state = 0;
actual_level = DEBUG1;
break;
} //end switch (state)
} //end while (state)
}
hit.i1 = hit.i[step];
hit.j1 = hit.j[step];
hit.states[step] = ViterbiMatrix::MM; // first state (STOP state) is set to MM state
hit.nsteps = step;
// first state (STOP state) is set to MM state
states.push_back((char) ViterbiMatrix::MM); // note: static cast is needed for avoiding "undefined" ViterbiMatrix::MM object error
nsteps = step;
i1 = i2;
j1 = j2;
i1 = vi[nsteps];
j1 = vj[nsteps];
}

void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr, float mac_min_length) {

// Trace back trough the matrix b[i][j] until STOP state is found

LogLevel actual_level = Log::reporting_level();
int step; // counts steps in path through 5-layered dynamic programming matrix
int i,j; // query and template match state indices

initializeBacktrace(q,t,hit);

// Make sure that backtracing stops when t:M1 or q:M1 is reached (Start state), e.g. sMM[i][1], or sIM[i][1] (M:MM, B:IM)
for (i = 0; i <= q.L; ++i) backtrace_matrix.setMatMat(i, 1, elem, ViterbiMatrix::STOP); // b[i][1] = STOP;
for (j = 1; j <= t.L; ++j) backtrace_matrix.setMatMat(1, j, elem, ViterbiMatrix::STOP); // b[1][j] = STOP;

std::vector<int> vi;
std::vector<int> vj;
std::vector<char> states;
int matched_cols, nsteps;
int i2 = hit.i2;
int j2 = hit.j2;
int i1, j1;

findBacktraceMAC(backtrace_matrix, elem, i1, j1, i2, j2, vi, vj, states, matched_cols, nsteps, actual_level);

char msg[512];
if (i2 < m_temp_hit->i1 || j2 < m_temp_hit->j1 || i1 > m_temp_hit->i2 || j1 > m_temp_hit->j2) {
snprintf(msg, sizeof(msg), "The main MAC hit (%d,%d)-(%d,%d) is outside the Viterbi hit scope (%d,%d)-(%d,%d), looking for the alternative...\n",
i1, j1, i2, j2, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2);
HH_LOG(WARNING) << msg;
// find first hit from mac_hit_end_positions which overlaps with Viterbi hit
int no = 1;
int found = 0;
int i2_0 = i2, j2_0 = j2;
for(std::multimap<double, std::pair<int, int> >::iterator it = hit.mac_hit_end_positions.begin();
it != hit.mac_hit_end_positions.end(); it++, no++) {
double score = -it->first;
i2 = it->second.first;
j2 = it->second.second;
vi.clear();
vj.clear();
states.clear();
findBacktraceMAC(backtrace_matrix, elem, i1, j1, i2, j2, vi, vj, states, matched_cols, nsteps, actual_level);
if (!(i2 < m_temp_hit->i1 || j2 < m_temp_hit->j1 || i1 > m_temp_hit->i2 || j1 > m_temp_hit->j2)) {
snprintf(msg, sizeof(msg), "Found the alternative MAC hit #%d (%d,%d)-(%d,%d) (score=%f) inside the Viterbi hit scope (%d,%d)-(%d,%d)\n",
no, i1, j1, i2, j2, score, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2);
HH_LOG(WARNING) << msg;
found = no;
break;
}
}
if (found == 0) {
nsteps = 0;
}
}
int mac_is_valid = 1;
if (matched_cols < hit.matched_cols * mac_min_length) {
snprintf(msg, sizeof(msg), "MAC alignment is too short, matched_cols=%d, the threshold is %f of the original matched_cols=%d: restoring the original Viterbi hit (%d,%d)-(%d,%d). You can consider using lower -mact value in order to avoid such cases.\n", matched_cols, mac_min_length, hit.matched_cols, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2);
HH_LOG(WARNING) << msg;
mac_is_valid = 0;
} else if (matched_cols < hit.matched_cols) {
snprintf(msg, sizeof(msg), "MAC alignment is shorter (matched_cols=%d) than the Viterbi hit (matched_cols=%d). You can consider using lower -mact value or higher -mac_min_length value in order to avoid such cases.\n", matched_cols, hit.matched_cols);
HH_LOG(WARNING) << msg;
}
if (!nsteps) {
snprintf(msg, sizeof(msg), "Could not find valid MAC alternative hit for the original Viterbi hit scope: restoring the original Viterbi hit (%d,%d)-(%d,%d). You can consider using lower -mact value in order to avoid such cases.\n", m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2);
HH_LOG(WARNING) << msg;
mac_is_valid = 0;
}
if (!mac_is_valid) {
restoreHitPath(hit);
return;
}

// register the checked hit
for (step = 1; step <= nsteps; step++) {
hit.states[step] = hit.state = states[step];
i = vi[step]; j = vj[step];
hit.i[step] = i;
hit.j[step] = j;
hit.alt_i->push_back(i);
hit.alt_j->push_back(j);
// Exclude cells in direct neighbourhood from all further alignments
for (int ii = imax(i-2,1); ii <= imin(i+2, q.L); ++ii)
backtrace_matrix.setCellOff(ii, j, elem, true);
for (int jj = imax(j-2,1); jj <= imin(j+2, t.L); ++jj)
backtrace_matrix.setCellOff(i, jj, elem, true);
}
if (nsteps == 0) {
hit.i[0] = vi[0];
hit.j[0] = vj[0];
hit.alt_i->push_back(vi[0]);
hit.alt_j->push_back(vj[0]);
hit.state = ViterbiMatrix::STOP;
}
hit.i1 = hit.i[nsteps];
hit.j1 = hit.j[nsteps];
hit.states[nsteps] = ViterbiMatrix::MM; // first state (STOP state) is set to MM state
hit.nsteps = nsteps;
hit.matched_cols = matched_cols;

// Allocate new space for alignment scores
hit.S = new float[hit.nsteps+1];
Expand Down Expand Up @@ -270,22 +362,23 @@ void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, Vi
/////////////////////////////////////////////////////////////////////////////////////
// Allocate memory for data of new alignment (sequence names, alignment, scores,...)
/////////////////////////////////////////////////////////////////////////////////////
void PosteriorDecoder::initializeBacktrace(HMM & t, Hit & hit) {
void PosteriorDecoder::initializeBacktrace(HMM & q, HMM & t, Hit & hit) {
int maxlen = q.L + t.L + 1;
// Allocate new space
if(hit.i) {
delete[] hit.i;
}
hit.i = new int[hit.i2 + hit.j2 + 2];
hit.i = new int[maxlen];

if(hit.j) {
delete[] hit.j;
}
hit.j = new int[hit.i2 + hit.j2 + 2];
hit.j = new int[maxlen];

if(hit.states) {
delete[] hit.states;
}
hit.states = new char[hit.i2 + hit.j2 + 2];
hit.states = new char[maxlen];

if(hit.S) {
delete[] hit.S;
Expand Down
5 changes: 5 additions & 0 deletions src/hhblits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,8 @@ void HHblits::help(Parameters& par, char all) {
printf(" -realign_old_hits realign hits from previous iterations \n");
printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n");
printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact);
printf(" -mac_min_length minimum length of MAC hit in comparison with the Viterbi hit\n");
printf(" (in terms of matched_cols, default=%.2f)\n", par.mac_min_length);
printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n");
if (all) {
printf(" -realign realign displayed hits with max. accuracy (MAC) algorithm \n");
Expand Down Expand Up @@ -762,6 +764,9 @@ void HHblits::ProcessArguments(Parameters& par) {
else if ((!strcmp(argv[i], "-mact") || !strcmp(argv[i], "-mapt"))
&& (i < argc - 1))
par.mact = atof(argv[++i]);
else if (!strcmp(argv[i], "-mac_min_length")
&& (i < argc - 1))
par.mac_min_length = atof(argv[++i]);
else if (!strcmp(argv[i], "-sc") && (i < argc - 1))
par.columnscore = atoi(argv[++i]);
//no help required
Expand Down
1 change: 1 addition & 0 deletions src/hhdecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Parameters::Parameters(const int argc, const char** argv) : argc(argc), argv(arg
ssa = 1.0f; // weight of ss evolution matrix
shift = -0.03f; // Shift match score up
mact = 0.3501f; // Probability threshold for MAC alignment in local mode for alignment ends (set to 0.3501 to track user modification)
mac_min_length = 0.0f; // Minimum length of MAC hit in comparison with the original Viterbi hit (in terms of matched_cols)
corr = 0.1f; // Weight of correlations of scores for |i-j|<=4

egq = 0.0f; // no charge for end gaps as default
Expand Down
1 change: 1 addition & 0 deletions src/hhdecl.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ class Parameters // Parameters for gap penalties and pseudocounts
float corr; // Weight of correlations between scores with |i-j|<=4
float shift; // Score offset for match-match states
double mact; // Probability threshold (negative offset) in MAC alignment determining greediness at ends of alignment
float mac_min_length; // Minimum length of MAC hit in comparison with the original Viterbi hit (in terms of matched_cols)
int realign_max; // Realign max ... hits
float maxmem; // maximum available memory in GB for realignment (approximately)

Expand Down
34 changes: 34 additions & 0 deletions src/hhhit.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <time.h>
#include <ctype.h>
#include <vector>
#include <map>

class Hit;

Expand Down Expand Up @@ -138,6 +139,39 @@ class Hit
char state; // 0: Start/STOP state 1: MM state 2: GD state (-D) 3: IM state 4: DG state (D-) 5 MI state
int min_overlap;

/* macAlgorithm() method stores here all the possible MAC path end positions (it's needed by backtraceMAC()) */
std::multimap<double, std::pair<int, int> > mac_hit_end_positions;
int max_mac_hit_end_positions;

void initMACHitEndPositions(int max) {
if (mac_hit_end_positions.size() > 0) {
mac_hit_end_positions.erase(mac_hit_end_positions.begin(), mac_hit_end_positions.end());
}
max_mac_hit_end_positions = max;
}

void storeMACHitEndPosition(double score, int i2, int j2) {
std::pair<int, int> p = std::pair<int, int>(i2, j2);
mac_hit_end_positions.insert(std::pair<double, std::pair<int, int> >(-score, p));
// store no more than max_mac_hit_end_positions for the best MAC hits
while (mac_hit_end_positions.size() > max_mac_hit_end_positions) {
mac_hit_end_positions.erase(std::next(mac_hit_end_positions.rbegin()).base());
}
}

void eraseMACHitEndPosition(double score, int i2, int j2) {
typedef std::multimap<double, std::pair<int, int> >::iterator mapIterator;

std::pair<mapIterator, mapIterator> its = mac_hit_end_positions.equal_range(-score);
std::pair<int, int> p = std::pair<int, int>(i2, j2);
for(mapIterator it = its.first; it != its.second; it++) {
if (it->second == p) {
mac_hit_end_positions.erase(it);
return;
}
}
}

private:
// Calculate probability of true positive : p_TP(score)/( p_TP(score)+p_FP(score) )
// TP: same superfamily OR MAXSUB score >=0.1
Expand Down
11 changes: 11 additions & 0 deletions src/hhmacalgorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ void PosteriorDecoder::macAlgorithm(HMM & q, HMM & t, Hit & hit,
// char * c_ptr = new char;
char val;
hit.min_overlap = 0;
hit.initMACHitEndPositions(imax(q.L, t.L)); // reasonable limit on the number of alternative MAC hits
// Dynamic programming
for (i = 1; i <= q.L; ++i) { // Loop through query positions i
// If q is compared to t, exclude regions where overlap of q with t < min_overlap residues
Expand Down Expand Up @@ -128,6 +129,13 @@ void PosteriorDecoder::macAlgorithm(HMM & q, HMM & t, Hit & hit,
hit.i2 = i; hit.j2 = j;
score_MAC = S_curr[j];
}
if((m_local || i == q.L) && val == ViterbiMatrix::MM) { // local alignment path ends with MM state
hit.storeMACHitEndPosition(S_curr[j], i, j);
if (val == ViterbiMatrix::MM) {
// delete suboptimal previous position
hit.eraseMACHitEndPosition(S_prev[j-1], i - 1, j - 1);
}
}

} // end if

Expand All @@ -145,6 +153,9 @@ void PosteriorDecoder::macAlgorithm(HMM & q, HMM & t, Hit & hit,
hit.j2 = jmax;
score_MAC = S_curr[jmax];
}
if (!m_local) {
hit.storeMACHitEndPosition(S_curr[jmax], i, jmax);
}

for (j = 0; j <= t.L; ++j)
S_prev[j] = S_curr[j];
Expand Down
Loading