Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: EditorUtils uses the document's locale for splitting words #1175

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ dependencies {
}
runtimeOnly(libs.language.detector)
runtimeOnly(libs.dumont.hunspell)
implementation(libs.icu4j)

// Lucene for tokenizers
implementation(libs.bundles.lucene)
Expand Down Expand Up @@ -1702,7 +1703,7 @@ tasks.register('testAcceptance', Test) {
classpath = sourceSets.testAcceptance.runtimeClasspath
systemProperties = System.properties
systemProperty 'java.util.logging.config.file', "${rootDir}/config/test/logger.properties"

dependsOn firstStepsEn
dependsOn ':aligner:jar'
}

Expand Down
4 changes: 2 additions & 2 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ commons_io = "2.16.1"
commons_text = "1.11.0"
commons_validator = "1.9.0"
jsoup = "1.18.2"
icu4j = { require = "[70,73.2[", prefer = "72.1" }
icu4j = { require = "[71.1,76.1[", prefer = "74.2" }
stax2api = "4.2.2"
woodstox = "6.5.0"
languagetool = "6.1"
Expand Down Expand Up @@ -65,7 +65,7 @@ commons-lang3 = {group = "org.apache.commons", name = "commons-lang3", version.r
commons-text = {group = "org.apache.commons", name = "commons-text", version.ref = "commons_text"}
commons-validator = {group = "commons-validator", name = "commons-validator", version.ref = "commons_validator"}
jsoup = {group = "org.jsoup", name = "jsoup", version.ref = "jsoup"}
icj4j = {group = "com.ibm.icu", name = "icu4j", version.ref = "icu4j"}
icu4j = {group = "com.ibm.icu", name = "icu4j", version.ref = "icu4j"}
stax2-api = {group = "org.codehaus.woodstox", name = "stax2-api", version.ref = "stax2api"}
woodstox-core = {group = "com.fasterxml.woodstox", name = "woodstox-core", version.ref = "woodstox"}
languagetool-all = {group = "org.languagetool", name = "language-all", version.ref = "languagetool"}
Expand Down
4 changes: 2 additions & 2 deletions language-modules/ja/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies {
exclude module: 'icu4j'
}
implementation(dependencies.variantOf(libs.lucene.gosen) { classifier("ipadic") })
implementation(libs.icj4j)
compileOnly(libs.icu4j)
}

testImplementation(libs.junit4)
Expand All @@ -43,7 +43,7 @@ dependencies {
exclude module: 'icu4j'
}
testRuntimeOnly(dependencies.variantOf(libs.lucene.gosen) { classifier("ipadic") })
testRuntimeOnly(libs.icj4j)
testRuntimeOnly(libs.icu4j)

testImplementation(libs.assertj)
testImplementation(testFixtures(project.rootProject))
Expand Down
12 changes: 9 additions & 3 deletions src/org/omegat/gui/editor/EditorController.java
Original file line number Diff line number Diff line change
Expand Up @@ -704,9 +704,12 @@ protected void loadDocument() {

doc.setDocumentFilter(new DocumentFilter3());

// add locate for target language to editor
// add locales to editor
Locale targetLocale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
editor.setLocale(targetLocale);
editor.setTargetLocale(targetLocale);
Locale sourceLocale = Core.getProject().getProjectProperties().getSourceLanguage().getLocale();
editor.setSourceLocale(sourceLocale);

editor.setDocument(doc);

Expand Down Expand Up @@ -1639,8 +1642,9 @@ public void changeCase(CHANGE_CASE_TO toWhat) {
try {
// no selection? make it the current word
if (start == end) {
start = EditorUtils.getWordStart(editor, start);
end = EditorUtils.getWordEnd(editor, end);
Locale locale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
start = EditorUtils.getWordStart(editor, start, locale);
end = EditorUtils.getWordEnd(editor, end, locale);

// adjust the bound again
if (start < translationStart && end <= translationEnd) {
Expand Down Expand Up @@ -1947,6 +1951,7 @@ private void createAdditionalPanes() {
.setComponentOrientation(BiDiUtils.isRtl(language) ? ComponentOrientation.RIGHT_TO_LEFT
: ComponentOrientation.LEFT_TO_RIGHT);
introPane.setEditable(false);
introPane.setName("IntroPane");
DragTargetOverlay.apply(introPane, dropInfo);
URI uri = Help.getHelpFileURI(OConsts.HELP_FIRST_STEPS_PREFIX, language, OConsts.HELP_FIRST_STEPS);
if (uri != null) {
Expand All @@ -1958,6 +1963,7 @@ private void createAdditionalPanes() {
emptyProjectPaneTitle = OStrings.getString("TF_INTRO_EMPTYPROJECT_FILENAME");
emptyProjectPane = new JTextPane();
emptyProjectPane.setEditable(false);
emptyProjectPane.setName("EmptyProjectPane");
emptyProjectPane.setText(OStrings.getString("TF_INTRO_EMPTYPROJECT"));
emptyProjectPane.setFont(mw.getApplicationFont());
DragTargetOverlay.apply(emptyProjectPane, dropInfo);
Expand Down
22 changes: 20 additions & 2 deletions src/org/omegat/gui/editor/EditorTextArea3.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

import javax.swing.JEditorPane;
import javax.swing.JPopupMenu;
Expand Down Expand Up @@ -141,6 +142,9 @@ public class EditorTextArea3 extends JEditorPane {
*/
protected boolean overtypeMode = false;

private Locale targetLocale;
private Locale sourceLocale;

public EditorTextArea3(EditorController controller) {
this.controller = controller;
setEditorKit(new StyledEditorKit() {
Expand All @@ -165,10 +169,16 @@ protected void createInputAttributes(Element element, MutableAttributeSet set) {
c.setBlinkRate(getCaret().getBlinkRate());
setCaret(c);

sourceLocale = getLocale();
targetLocale = getLocale();

addCaretListener(e -> {
try {
int start = EditorUtils.getWordStart(EditorTextArea3.this, e.getMark());
int end = EditorUtils.getWordEnd(EditorTextArea3.this, e.getMark());
// Choose the locale used for word splitting: the target locale when
// the caret mark is inside the active translation, otherwise the source locale.
Locale locale = isInActiveTranslation(e.getMark()) ? targetLocale : sourceLocale;
int start = EditorUtils.getWordStart(EditorTextArea3.this, e.getMark(), locale);
int end = EditorUtils.getWordEnd(EditorTextArea3.this, e.getMark(), locale);
if (end - start <= 0) {
// word not defined
return;
Expand Down Expand Up @@ -200,6 +210,14 @@ public void setFont(Font font) {
}
}

/**
 * Sets the locale used for word splitting when the caret mark is inside
 * the active translation.
 *
 * @param targetLocale locale to store as the target locale.
 */
void setTargetLocale(Locale targetLocale) {
this.targetLocale = targetLocale;
}

/**
 * Sets the locale used for word splitting when the caret mark is outside
 * the active translation.
 *
 * @param sourceLocale locale to store as the source locale.
 */
void setSourceLocale(Locale sourceLocale) {
this.sourceLocale = sourceLocale;
}

/**
* Return OmDocument instead just a Document. If editor was not initialized
* with OmDocument, it will contains other Document implementation. In this
Expand Down
107 changes: 89 additions & 18 deletions src/org/omegat/gui/editor/EditorUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@
import java.util.Locale;

import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.Element;
import javax.swing.text.JTextComponent;
import javax.swing.text.Utilities;

import com.ibm.icu.text.BreakIterator;

import org.omegat.core.Core;
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
Expand Down Expand Up @@ -61,15 +65,30 @@ private EditorUtils() {
* Determines the start of a word for the given model location. This method
* skips direction char.
*
* TODO: change to use document's locale
*
* @param c
* @param offs
* @return
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @return position of word start on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
@Deprecated
public static int getWordStart(JTextComponent c, int offs) throws BadLocationException {
int result = Utilities.getWordStart(c, offs);
return getWordStart(c, offs, c.getLocale());
}

/**
* Determines the start of a word for the given model location. This method
* skips direction char.
*
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @param locale locale of the text.
* @return position of word start on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
public static int getWordStart(JTextComponent c, int offs, Locale locale) throws BadLocationException {
int result = getWordBoundary(c, offs, locale, false);
char ch = c.getDocument().getText(result, 1).charAt(0);
if (isDirectionChar(ch)) {
result++;
Expand All @@ -81,15 +100,30 @@ public static int getWordStart(JTextComponent c, int offs) throws BadLocationExc
* Determines the end of a word for the given model location. This method
* skips direction char.
*
* TODO: change to use document's locale
*
* @param c
* @param offs
* @return
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @return position of the word end on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
@Deprecated
public static int getWordEnd(JTextComponent c, int offs) throws BadLocationException {
int result = Utilities.getWordEnd(c, offs);
return getWordEnd(c, offs, c.getLocale());
}

/**
* Determines the end of a word for the given model location. This method
* skips direction char.
*
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @param locale locale of the text.
* @return position of the word end on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
public static int getWordEnd(JTextComponent c, int offs, Locale locale) throws BadLocationException {
int result = getWordBoundary(c, offs, locale, true);
if (result > 0) {
char ch = c.getDocument().getText(result - 1, 1).charAt(0);
if (isDirectionChar(ch)) {
Expand All @@ -99,6 +133,46 @@ public static int getWordEnd(JTextComponent c, int offs) throws BadLocationExcep
return result;
}

/**
 * Finds a word boundary around a document offset by extracting the text of
 * the paragraph that contains the offset and delegating to the string-based
 * boundary search.
 *
 * @param c TextComponent of the editor area.
 * @param offs offset in the document.
 * @param locale locale passed to the boundary search.
 * @param end {@code true} to get the word end, {@code false} for the word start.
 * @return document offset of the boundary, or {@code offs} unchanged when
 *         the paragraph has no text inside the document.
 * @throws BadLocationException
 *             when no paragraph element is found at the offset.
 */
private static int getWordBoundary(JTextComponent c, int offs, Locale locale, boolean end)
        throws BadLocationException {
    Element paragraph = Utilities.getParagraphElement(c, offs);
    if (paragraph == null) {
        throw new BadLocationException("No word at " + offs, offs);
    }
    int paragraphStart = paragraph.getStartOffset();
    Document document = c.getDocument();
    // The paragraph element's end offset is capped at the document length.
    int paragraphEnd = Math.min(paragraph.getEndOffset(), document.getLength());
    int length = paragraphEnd - paragraphStart;
    if (length <= 0) {
        // Nothing to split: hand the offset back untouched.
        return offs;
    }
    String paragraphText = document.getText(paragraphStart, length);
    // The string-based search works in paragraph-relative coordinates.
    return paragraphStart + getWordBoundary(locale, paragraphText, offs - paragraphStart, end);
}

/**
 * Get word boundary.
 * <p>
 * When the end argument is true, return a word end.
 * Otherwise, return a start of word.
 * <p>
 * A position at or beyond the last boundary of the line is moved to one
 * before that boundary, so that a following boundary exists. When no
 * boundary precedes the found word end (for example, an empty line), the
 * first boundary is returned as the word start instead of
 * {@code BreakIterator.DONE}.
 *
 * @param locale locale of the line string.
 * @param lineString a string of the line.
 * @param wordPosition target position of the line.
 * @param end return end of word, otherwise start of word.
 * @return index of the word boundary.
 */
static int getWordBoundary(Locale locale, String lineString, int wordPosition, boolean end) {
    BreakIterator words = BreakIterator.getWordInstance(locale);
    words.setText(lineString);
    int lastBoundary = words.last();
    int position = wordPosition >= lastBoundary ? lastBoundary - 1 : wordPosition;
    int wordEnd = words.following(position);
    if (end) {
        return wordEnd;
    }
    // previous() steps back from the word end to the word start; it reports
    // DONE when the iterator is already on the first boundary.
    int wordStart = words.previous();
    return wordStart == BreakIterator.DONE ? words.first() : wordStart;
}

/**
* Check if char is direction char(u202A,u202B,u202C).
*
Expand Down Expand Up @@ -420,7 +494,7 @@ public static String addBidiAroundTags(String text, SourceTextEntry ste) {
StringBuilder s = new StringBuilder(text.length() * 12 / 10);
for (Tag t : tags) {
if (pos < t.pos) {
s.append(text.substring(pos, t.pos));
s.append(text, pos, t.pos);
}
s.append(SegmentBuilder.BIDI_RLM_CHAR);
s.append(SegmentBuilder.BIDI_LRM_CHAR);
Expand All @@ -437,11 +511,8 @@ public static String addBidiAroundTags(String text, SourceTextEntry ste) {

public static boolean hasBidiAroundTag(String text, String tag, int pos) {
try {
boolean has = true;
if (text.charAt(pos - 1) != SegmentBuilder.BIDI_LRM_CHAR
|| text.charAt(pos - 2) != SegmentBuilder.BIDI_RLM_CHAR) {
has = false;
}
boolean has = text.charAt(pos - 1) == SegmentBuilder.BIDI_LRM_CHAR
&& text.charAt(pos - 2) == SegmentBuilder.BIDI_RLM_CHAR;
if (text.charAt(pos + tag.length()) != SegmentBuilder.BIDI_LRM_CHAR
|| text.charAt(pos + tag.length() + 1) != SegmentBuilder.BIDI_RLM_CHAR) {
has = false;
Expand Down
3 changes: 3 additions & 0 deletions test-acceptance/data/project_CN_JP/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
project_stats.txt
project_stats.json
*.bak
Empty file.
3 changes: 3 additions & 0 deletions test-acceptance/data/project_CN_JP/glossary/glossary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Glossary in tab-separated format -*- coding: utf-8 -*-
介绍 紹介
中的 中心的な
33 changes: 33 additions & 0 deletions test-acceptance/data/project_CN_JP/omegat.project
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?xml version='1.0' encoding='UTF-8'?>
<omegat>
<project version="1.0">
<source_dir>source</source_dir>
<source_dir_excludes>
<mask>**/.svn/**</mask>
<mask>**/CVS/**</mask>
<mask>**/.cvs/**</mask>
<mask>**/.git/**</mask>
<mask>**/.hg/**</mask>
<mask>**/.repositories/**</mask>
<mask>**/desktop.ini</mask>
<mask>**/Thumbs.db</mask>
<mask>**/.DS_Store</mask>
<mask>**/~$*</mask>
</source_dir_excludes>
<target_dir>target</target_dir>
<tm_dir>tm</tm_dir>
<glossary_dir>glossary</glossary_dir>
<glossary_file>.-glossary.txt</glossary_file>
<dictionary_dir>dictionary</dictionary_dir>
<export_tm_dir></export_tm_dir>
<export_tm_levels></export_tm_levels>
<source_lang>zh-CN</source_lang>
<target_lang>ja-JP</target_lang>
<source_tok>org.omegat.tokenizer.LuceneSmartChineseTokenizer</source_tok>
<target_tok>org.omegat.tokenizer.LuceneJapaneseTokenizer</target_tok>
<sentence_seg>true</sentence_seg>
<support_default_translations>true</support_default_translations>
<remove_tags>true</remove_tags>
<external_command></external_command>
</project>
</omegat>
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#Thu Nov 07 21:30:29 JST 2024
LAST_ENTRY_NUMBER=1
LAST_ENTRY_SRC=\u592A\u5E73\u5BFA\u4E2D\u7684\u6587\u7B14\u5854
LAST_ENTRY_FILE=source.txt
Empty file.
17 changes: 17 additions & 0 deletions test-acceptance/data/project_CN_JP/omegat/project_save.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE tmx SYSTEM "tmx11.dtd">
<tmx version="1.1">
<header creationtool="OmegaT" o-tmf="OmegaT TMX" adminlang="EN-US" datatype="plaintext" creationtoolversion="6.1.0_0_50ff299ad" segtype="sentence" srclang="zh-CN"/>
<body>
<!-- Default translations -->
<tu>
<tuv lang="zh-CN">
<seg>太平寺中的文笔塔</seg>
</tuv>
<tuv lang="ja-JP" changeid="Hiroshi Miura" changedate="20241107T122621Z" creationid="Hiroshi Miura" creationdate="20241107T122621Z">
<seg>太平寺の中心的なペン塔</seg>
</tuv>
</tu>
<!-- Alternative translations -->
</body>
</tmx>
5 changes: 5 additions & 0 deletions test-acceptance/data/project_CN_JP/source/source.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
太平寺中的文笔塔

文筆塔原是江苏省常州市太平寺中的塔。太平寺始建于南北朝齐梁时期,是常州最古老的佛寺之一,今已不存。
文笔塔为砖木结构,七级八面,每级4个拱门,中有旋梯。塔下有曲池、拱桥。
“夕照塔影”为文笔胜景。现存塔为光绪末年(1905-1908年)重建
Empty file.
Empty file.
Loading
Loading