Skip to content

Commit

Permalink
result2profile can print frequencies in tsv format (#907)
Browse files Browse the repository at this point in the history
  • Loading branch information
Victor-Mihaila authored Nov 28, 2024
1 parent 747c64c commit c2c3ad9
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 13 deletions.
17 changes: 17 additions & 0 deletions src/alignment/PSSMCalculator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,23 @@ void PSSMCalculator::printPSSM(size_t queryLength){
}
}

void PSSMCalculator::profileToString(std::string& result, size_t queryLength){
result.append(5, ' ');
for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
result.append(SSTR(subMat->num2aa[aa]));
result.append(6, ' ');
}
result.append(1, '\n');
for (size_t i = 0; i < queryLength; i++) {
for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
result.append(SSTR(profile[i * Sequence::PROFILE_AA_SIZE + aa], 4));
result.append(1, ' ');
}
result.append(1, '\n');
}
result.append(1, '\n');
}

void PSSMCalculator::computeLogPSSM(BaseMatrix *subMat, char *pssm, const float *profile, float bitFactor, size_t queryLength, float scoreBias) {
for(size_t pos = 0; pos < queryLength; pos++) {
for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
Expand Down
1 change: 1 addition & 0 deletions src/alignment/PSSMCalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class PSSMCalculator {

void printProfile(size_t queryLength);
void printPSSM(size_t queryLength);
void profileToString(std::string& result, size_t queryLength);

// prepare pseudocounts
static void preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts, size_t entrySize, size_t queryLength, const float **R);
Expand Down
3 changes: 3 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ Parameters::Parameters():
PARAM_PC_MODE(PARAM_PC_MODE_ID, "--pseudo-cnt-mode", "Pseudo count mode", "use 0: substitution-matrix or 1: context-specific pseudocounts", typeid(int), (void *) &pcmode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_PCA(PARAM_PCA_ID, "--pca", "Pseudo count a", "Pseudo count admixture strength", typeid(MultiParam<PseudoCounts>), (void *) &pca, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_PCB(PARAM_PCB_ID, "--pcb", "Pseudo count b", "Pseudo counts: Neff at half of maximum admixture (range 0.0-inf)", typeid(MultiParam<PseudoCounts>), (void *) &pcb, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_PROFILE_OUTPUT_MODE(PARAM_PROFILE_OUTPUT_MODE_ID, "--profile-output-mode", "Profile output mode", "Profile output mode: 0: binary log-odds 1: human-readable frequencies", typeid(int), (void *) &profileOutputMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
// sequence2profile
PARAM_NEFF(PARAM_NEFF_ID, "--neff", "Neff", "Neff included into context state profile (1.0,20.0)", typeid(float), (void *) &neff, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE),
PARAM_TAU(PARAM_TAU_ID, "--tau", "Tau", "Tau: context state pseudo count mixture (0.0,1.0)", typeid(float), (void *) &tau, "[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE),
Expand Down Expand Up @@ -570,6 +571,7 @@ Parameters::Parameters():
result2profile.push_back(&PARAM_THREADS);
result2profile.push_back(&PARAM_COMPRESSED);
result2profile.push_back(&PARAM_V);
result2profile.push_back(&PARAM_PROFILE_OUTPUT_MODE);
// createtsv
createtsv.push_back(&PARAM_FIRST_SEQ_REP_SEQ);
Expand Down Expand Up @@ -2437,6 +2439,7 @@ void Parameters::setDefaults() {
pcmode = PCMODE_SUBSTITUTION_SCORE;
pca = MultiParam<PseudoCounts>(PseudoCounts(1.1, 1.4));
pcb = MultiParam<PseudoCounts>(PseudoCounts(4.1, 5.8));
profileOutputMode = 0;

// sequence2profile
neff = 1.0;
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ class Parameters {
int pcmode;
MultiParam<PseudoCounts> pca;
MultiParam<PseudoCounts> pcb;
int profileOutputMode;

// sequence2profile
float neff;
Expand Down Expand Up @@ -855,6 +856,7 @@ class Parameters {
PARAMETER(PARAM_PC_MODE)
PARAMETER(PARAM_PCA)
PARAMETER(PARAM_PCB)
PARAMETER(PARAM_PROFILE_OUTPUT_MODE)

// sequence2profile
PARAMETER(PARAM_NEFF)
Expand Down
10 changes: 10 additions & 0 deletions src/commons/Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,17 @@ std::string SSTR(double x) {
return fmt::format("{:.3E}", x);
}

template<>
std::string SSTR(double x, int precision) {
return fmt::format("{:.{}E}", x, precision);
}

template<>
std::string SSTR(float x) {
return fmt::format("{:.3f}", x);
}

template<>
std::string SSTR(float x, int precision) {
return fmt::format("{:.{}f}", x, precision);
}
8 changes: 8 additions & 0 deletions src/commons/Util.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ std::string SSTR(T) {
return "";
}

template<typename T>
std::string SSTR(T, int) {
static_assert(assert_false<T>::value , "Not implemented for requested type");
return "";
}

template<> std::string SSTR(const char*);
template<> std::string SSTR(char*);
template<> std::string SSTR(bool);
Expand All @@ -45,6 +51,8 @@ template<> std::string SSTR(long long);
template<> std::string SSTR(unsigned long long);
template<> std::string SSTR(double);
template<> std::string SSTR(float);
template<> std::string SSTR(double, int precision);
template<> std::string SSTR(float, int precision);

#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))

Expand Down
40 changes: 27 additions & 13 deletions src/util/result2profile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret
}

int type = Parameters::DBTYPE_HMM_PROFILE;
if (returnAlnRes) {
const int writePlain = par.profileOutputMode == 1;
if (par.profileOutputMode == 1) {
type = Parameters::DBTYPE_OMIT_FILE;
par.compressed = false;
} else if (returnAlnRes) {
type = Parameters::DBTYPE_ALIGNMENT_RES;
if (needSrcIndex) {
type = DBReader<unsigned int>::setExtendedDbtype(type, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC);
Expand Down Expand Up @@ -261,18 +265,25 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret
alnResults,
#endif
par.wg, 0.0);
if (par.compBiasCorrection == true){
SubstitutionMatrix::calcGlobalAaBiasCorrection(&subMat, pssmRes.pssm, pNullBuffer,
Sequence::PROFILE_AA_SIZE,
res.centerLength);
}

if (par.maskProfile == true) {
masker.mask(centerSequence, par.maskProb, pssmRes);
}
pssmRes.toBuffer(centerSequence, subMat, result);
if (writePlain) {
result.clear();
result.append("Query profile of sequence ");
result.append(SSTR(queryKey));
result.push_back('\n');
calculator.profileToString(result, res.centerLength);
} else {
if (par.compBiasCorrection == true){
SubstitutionMatrix::calcGlobalAaBiasCorrection(&subMat, pssmRes.pssm, pNullBuffer,
Sequence::PROFILE_AA_SIZE,
res.centerLength);
}
if (par.maskProfile == true) {
masker.mask(centerSequence, par.maskProb, pssmRes);
}
pssmRes.toBuffer(centerSequence, subMat, result);
}
}
resultWriter.writeData(result.c_str(), result.length(), queryKey, thread_idx);
resultWriter.writeData(result.c_str(), result.length(), queryKey, thread_idx, writePlain == false);
result.clear();
alnResults.clear();

Expand All @@ -281,7 +292,10 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret
}
delete[] pNullBuffer;
}
resultWriter.close(returnAlnRes == false);
resultWriter.close(returnAlnRes == false || writePlain == true);
if (writePlain) {
FileUtil::remove(par.db4Index.c_str());
}
resultReader.close();

if (!sameDatabase) {
Expand Down

0 comments on commit c2c3ad9

Please sign in to comment.