Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUGS#1251][6.0][backport] fix: FindMatches#search ignore penalty when segmented matches #1215

Open
wants to merge 4 commits into
base: releases/6.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion release/changes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
OmegaT 6.0.2
----------------------------------------------------------------------
0 Enhancement
4 Bug fixes
5 Bug fixes
0 Localisation updates
----------------------------------------------------------------------

Expand All @@ -11,6 +11,11 @@
- Preferences/CutomColorSelector don't show notification immediately when modifying color
https://sourceforge.net/p/omegat/bugs/1273/

- tm/penalty-nnn value is not respected
- FindMatcher always search separate segment match without penalty
https://sourceforge.net/p/omegat/bugs/1248/
https://sourceforge.net/p/omegat/bugs/1251/

- [Revert regression] List of issues stays hidden in the background
https://sourceforge.net/p/omegat/bugs/1225/

Expand Down
4 changes: 2 additions & 2 deletions src/org/omegat/core/data/ProjectTMX.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,14 @@ public class ProjectTMX {
*
* It must be used with synchronization around ProjectTMX.
*/
Map<String, TMXEntry> defaults;
protected Map<String, TMXEntry> defaults;

/**
* Storage for alternative translations for current project.
*
* It must be used with synchronization around ProjectTMX.
*/
Map<EntryKey, TMXEntry> alternatives;
protected Map<EntryKey, TMXEntry> alternatives;

final CheckOrphanedCallback checkOrphanedCallback;

Expand Down
50 changes: 34 additions & 16 deletions src/org/omegat/core/matching/NearString.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,31 @@
*/
public class NearString {
public enum MATCH_SOURCE {
MEMORY, TM, FILES
MEMORY, TM, FILES, TM_SUBSEG
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
MEMORY, TM, FILES, TM_SUBSEG
MEMORY, TM, FILES, SUBSEGMENTS

};

public enum SORT_KEY {
SCORE, SCORE_NO_STEM, ADJUSTED_SCORE
}

public NearString(final EntryKey key, final String source, final String translation, MATCH_SOURCE comesFrom,
final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem, final int adjustedScore,
final byte[] nearData, final String projName, final String creator, final long creationDate,
final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem, final int adjustedScore,
final byte[] nearData, final String projName, final String creator, final long creationDate,
final String changer, final long changedDate, final List<TMXProp> props) {
this(key, source, translation, comesFrom, fuzzyMark, new Scores(nearScore, nearScoreNoStem,
adjustedScore, 0), nearData, projName, creator, creationDate, changer, changedDate, props);
}

public NearString(final EntryKey key, final String source, final String translation, MATCH_SOURCE comesFrom,
final boolean fuzzyMark, final Scores scores, final byte[] nearData, final String projName,
final String creator, final long creationDate,
final String changer, final long changedDate, final List<TMXProp> props) {
this.key = key;
this.source = source;
this.translation = translation;
this.comesFrom = comesFrom;
this.fuzzyMark = fuzzyMark;
this.scores = new Scores[] { new Scores(nearScore, nearScoreNoStem, adjustedScore) };
this.scores = new Scores[] { scores };
this.attr = nearData;
this.projs = new String[] { projName == null ? "" : projName };
this.props = props;
Expand All @@ -77,27 +85,35 @@ public static NearString merge(NearString ns, final EntryKey key, final String s
MATCH_SOURCE comesFrom, final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem,
final int adjustedScore, final byte[] nearData, final String projName, final String creator,
final long creationDate, final String changer, final long changedDate, final List<TMXProp> props) {
return merge(ns, key, source, translation, comesFrom, fuzzyMark, new Scores(nearScore,
nearScoreNoStem, adjustedScore, 0), nearData, projName, creator, creationDate, changer,
changedDate, props);
}

public static NearString merge(NearString ns, final EntryKey key, final String source, final String translation,
MATCH_SOURCE comesFrom, final boolean fuzzyMark, final Scores scores,
final byte[] nearData, final String projName, final String creator,
final long creationDate, final String changer, final long changedDate, final List<TMXProp> props) {

List<String> projs = new ArrayList<>();
List<Scores> scores = new ArrayList<>();
List<Scores> mergedScores = new ArrayList<>();
projs.addAll(Arrays.asList(ns.projs));
scores.addAll(Arrays.asList(ns.scores));
mergedScores.addAll(Arrays.asList(ns.scores));

NearString merged;
if (nearScore > ns.scores[0].score) {
merged = new NearString(key, source, translation, comesFrom, fuzzyMark, nearScore,
nearScoreNoStem, adjustedScore, nearData, null, creator, creationDate, changer, changedDate, props);
if (scores.score > ns.scores[0].score) {
merged = new NearString(key, source, translation, comesFrom, fuzzyMark, scores,
nearData, null, creator, creationDate, changer, changedDate, props);
projs.add(0, projName);
scores.add(0, merged.scores[0]);
mergedScores.add(0, merged.scores[0]);
} else {
merged = new NearString(ns.key, ns.source, ns.translation, ns.comesFrom, ns.fuzzyMark, nearScore,
nearScoreNoStem, adjustedScore, ns.attr, null, ns.creator, ns.creationDate, ns.changer,
ns.changedDate, ns.props);
merged = new NearString(ns.key, ns.source, ns.translation, ns.comesFrom, ns.fuzzyMark, scores,
ns.attr, null, ns.creator, ns.creationDate, ns.changer, ns.changedDate, ns.props);
projs.add(projName);
scores.add(merged.scores[0]);
mergedScores.add(merged.scores[0]);
}
merged.projs = projs.toArray(new String[projs.size()]);
merged.scores = scores.toArray(new Scores[scores.size()]);
merged.scores = mergedScores.toArray(new Scores[mergedScores.size()]);
return merged;
}

Expand Down Expand Up @@ -130,11 +146,13 @@ public static class Scores {
public final int scoreNoStem;
/** adjusted similarity score for match including all tokens */
public final int adjustedScore;
public final int penalty;

public Scores(int score, int scoreNoStem, int adjustedScore) {
public Scores(int score, int scoreNoStem, int adjustedScore, int penalty) {
this.score = score;
this.scoreNoStem = scoreNoStem;
this.adjustedScore = adjustedScore;
this.penalty = penalty;
}

public String toString() {
Expand Down
51 changes: 31 additions & 20 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -273,6 +275,8 @@ public void iterate(EntryKey source, TMXEntry trans) {
if (separateSegmentMatcher != null) {
// split paragraph even when segmentation disabled, then find
// matches for every segment
int maxPenalty = 0;
Set<String> tmxNames = new HashSet<>();
List<StringBuilder> spaces = new ArrayList<StringBuilder>();
List<Rule> brules = new ArrayList<Rule>();
Language sourceLang = project.getProjectProperties().getSourceLanguage();
Expand All @@ -292,6 +296,14 @@ public void iterate(EntryKey source, TMXEntry trans) {
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
fsrc.add(segmentMatch.get(0).source);
ftrans.add(segmentMatch.get(0).translation);
segmentMatch.stream().filter(match -> !match.projs[0].isEmpty())
.map(match -> match.projs[0]).forEach(tmxNames::add);
if (segmentMatch.get(0).fuzzyMark) {
if (maxPenalty < PENALTY_FOR_FUZZY) {
maxPenalty = PENALTY_FOR_FUZZY;
}
}
maxPenalty = Math.max(maxPenalty, segmentMatch.get(0).scores[0].penalty);
} else {
fsrc.add("");
ftrans.add("");
Expand All @@ -301,8 +313,8 @@ public void iterate(EntryKey source, TMXEntry trans) {
String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
// glue found translations
String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
0, null);
processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM_SUBSEG, false, maxPenalty,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM_SUBSEG, false, maxPenalty,
processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.SUBSEGMENTS, false, maxPenalty,

String.join(",", tmxNames), "", 0, "", 0, null);
}
}

Expand Down Expand Up @@ -421,8 +433,9 @@ protected void processEntry(EntryKey key, String source, String translation,
return;
}

addNearString(key, source, translation, comesFrom, fuzzy, similarityStem, similarityNoStem,
simAdjusted, null, tmxName, creator, creationDate, changer, changedDate, props);
addNearString(key, source, translation, comesFrom, fuzzy, new NearString.Scores(similarityStem,
similarityNoStem, simAdjusted, penalty),
null, tmxName, creator, creationDate, changer, changedDate, props);
}

/**
Expand Down Expand Up @@ -457,8 +470,7 @@ protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final
* "similarity,simAdjusted"
*/
protected void addNearString(final EntryKey key, final String source, final String translation,
NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int similarity,
final int similarityNoStem, final int simAdjusted, final byte[] similarityData,
NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, NearString.Scores scores, final byte[] similarityData,
final String tmxName, final String creator, final long creationDate, final String changer,
final long changedDate, final List<TMXProp> tuProperties) {
// find position for new data
Expand All @@ -470,25 +482,25 @@ protected void addNearString(final EntryKey key, final String source, final Stri
// single NearString with
// multiple project entries.
result.set(i,
NearString.merge(st, key, source, translation, comesFrom, fuzzy, similarity,
similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate,
NearString.merge(st, key, source, translation, comesFrom, fuzzy,
scores, similarityData, tmxName, creator, creationDate,
changer, changedDate, tuProperties));
return;
}
if (st.scores[0].score < similarity) {
if (st.scores[0].score < scores.score) {
break;
}
if (st.scores[0].score == similarity) {
if (st.scores[0].scoreNoStem < similarityNoStem) {
if (st.scores[0].score == scores.score) {
if (st.scores[0].scoreNoStem < scores.scoreNoStem) {
break;
}
if (st.scores[0].scoreNoStem == similarityNoStem) {
if (st.scores[0].adjustedScore < simAdjusted) {
if (st.scores[0].scoreNoStem == scores.scoreNoStem) {
if (st.scores[0].adjustedScore < scores.adjustedScore) {
break;
}
// Patch contributed by Antonio Vilei
// text with the same case has precedence
if (similarity == 100 && !st.source.equals(srcText) && source.equals(srcText)) {
if (scores.score == 100 && !st.source.equals(srcText) && source.equals(srcText)) {
break;
}
}
Expand All @@ -497,9 +509,8 @@ protected void addNearString(final EntryKey key, final String source, final Stri
}

result.add(pos,
new NearString(key, source, translation, comesFrom, fuzzy, similarity, similarityNoStem,
simAdjusted, similarityData, tmxName, creator, creationDate, changer, changedDate,
tuProperties));
new NearString(key, source, translation, comesFrom, fuzzy, scores, similarityData, tmxName, creator,
creationDate, changer, changedDate, tuProperties));
if (result.size() > maxCount) {
result.remove(result.size() - 1);
}
Expand All @@ -508,9 +519,9 @@ protected void addNearString(final EntryKey key, final String source, final Stri
/*
* Methods for tokenize strings with caching.
*/
Map<String, Token[]> tokenizeStemCache = new HashMap<String, Token[]>();
Map<String, Token[]> tokenizeNoStemCache = new HashMap<String, Token[]>();
Map<String, Token[]> tokenizeAllCache = new HashMap<String, Token[]>();
Map<String, Token[]> tokenizeStemCache = new HashMap<>();
Map<String, Token[]> tokenizeNoStemCache = new HashMap<>();
Map<String, Token[]> tokenizeAllCache = new HashMap<>();

public Token[] tokenizeStem(String str) {
Token[] tokens = tokenizeStemCache.get(str);
Expand Down
16 changes: 16 additions & 0 deletions test/data/tmx/penalty-010/segment_1.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx PUBLIC "-//LISA OSCAR:1998//DTD for Translation Memory eXchange//EN" "tmx14.dtd">

<tmx version="1.4">
<header creationtoolversion="0.1" adminlang="en" segtype="paragraph" creationdate="20230930T155211Z" datatype="unknown" srclang="ja" creationtool="txt2tmx" o-tmf="TextEdit"></header>
<body>
<tu>
<tuv xml:lang="fr">
<seg>weird behavior</seg>
</tuv>
<tuv xml:lang="ja">
<seg>地力の搾取と浪費が現われる。(1)</seg>
</tuv>
</tu>
</body>
</tmx>
46 changes: 46 additions & 0 deletions test/data/tmx/test-multiple-entries.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx SYSTEM "tmx14.dtd">
<tmx version="1.4">
<header datatype="plaintext" srclang="en-US" adminlang="EN-US" o-tmf="OmegaT TMX" segtype="sentence"
creationtoolversion="test" creationtool="test"/>
<body>
<!-- Default translations -->
<tu>
<tuv lang="en-US">
<seg>Other</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200523T143256Z">
<seg>Altre</seg>
</tuv>
</tu>
<tu>
<tuv lang="en-US">
<seg>For installation on Linux.</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131725Z" creationid="id" creationdate="20200526T131725Z">
<seg>Per l’installazioni nant’à i sistemi Linux.</seg>
</tuv>
</tu>
<tu>
<tuv lang="en-US">
<seg>For installation on other operating systems (such as FreeBSD and Solaris).</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131840Z" creationid="id"
creationdate="20200526T131840Z">
<seg>Per l’installazioni nant’à d’altri sistemi (cum’è FreeBSD è Solaris).</seg>
</tuv>
</tu>
<!-- Alternative translations -->
<tu>
<prop type="file">website/download.html</prop>
<prop type="prev">For installation on Linux.</prop>
<prop type="next">For installation on other operating systems (such as FreeBSD and Solaris).&lt;br0/></prop>
<tuv lang="en-US">
<seg>Other</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131742Z" creationid="id" creationdate="20200526T131742Z">
<seg>Altri</seg>
</tuv>
</tu>
</body>
</tmx>
Loading