Skip to content

Commit

Permalink
fix small problems
Browse files Browse the repository at this point in the history
  • Loading branch information
jsksxs360 committed Jul 29, 2021
1 parent edb425c commit 0c26759
Showing 1 changed file with 55 additions and 3 deletions.
58 changes: 55 additions & 3 deletions src/me/xiaosheng/chnlp/seg/Segment.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package me.xiaosheng.chnlp.seg;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import com.hankcs.hanlp.utility.SentencesUtil;

/**
* 分词器
Expand Down Expand Up @@ -153,7 +153,59 @@ public static List<String> getNatureList(List<Term> termList) {
* @return 句子列表
*/
public static List<String> splitSentence(String document) {
// NOTE(review): this text comes from a rendered commit diff whose +/- markers
// were lost, so the two return statements below appear to be the removed and
// the added side of the same line rather than consecutive statements.
// Confirm against the repository before compiling this as-is.
//
// Presumably the removed side: delegates to HanLP's SentencesUtil, passing false.
return SentencesUtil.toSentenceList(document, false);
// Presumably the added side: delegates to the private char[] overload in this
// class with shortest = false. Throws NullPointerException if document is null,
// because toCharArray() is invoked on it directly.
return splitSentence(document.toCharArray(), false);
}

/**
 * Adds the trimmed contents of {@code sb} to {@code sentences}.
 * Nothing is added when the trimmed text is empty.
 *
 * @param sb        buffer holding the sentence collected so far
 * @param sentences list that receives the finished sentence
 */
private static void insertIntoList(StringBuilder sb, List<String> sentences) {
    String content = sb.toString().trim();
    if (content.length() > 0) {
        sentences.add(content);
    }
}

/**
 * Splits text into sentences. The terminating punctuation stays attached to
 * the sentence it closes, and every sentence is trimmed before it is stored.
 *
 * <p>Rules applied per character:
 * <ul>
 * <li>{@code '.'} closes a sentence only when the next character has a code
 *     above 128 (presumably so decimals and ASCII abbreviations stay intact);</li>
 * <li>two consecutive ellipsis characters (U+2026) close a sentence, a single
 *     one does not;</li>
 * <li>commas and semicolons, ASCII or full-width (U+FF0C, U+FF1B), close a
 *     sentence only when {@code shortest} is {@code true};</li>
 * <li>the ideographic full stop (U+3002), exclamation and question marks in
 *     ASCII or full-width form (U+FF01, U+FF1F), line feed and carriage
 *     return always close a sentence.</li>
 * </ul>
 *
 * @param chars    characters of the text to split
 * @param shortest whether commas and semicolons also act as sentence breaks
 * @return the sentences in order of appearance; empty when there is no
 *         non-blank content
 */
private static List<String> splitSentence(char[] chars, boolean shortest) {
    StringBuilder sb = new StringBuilder();
    List<String> sentences = new LinkedList<String>();
    for (int i = 0; i < chars.length; i++) {
        char c = chars[i];
        // Drop whitespace in front of a sentence. Character.isWhitespace already
        // covers the ASCII space and the ideographic space U+3000, so no extra
        // comparison is needed.
        if (sb.length() == 0 && Character.isWhitespace(c)) {
            continue;
        }
        sb.append(c);
        switch (c) {
        case '.':
            if (i < chars.length - 1 && chars[i + 1] > 128) {
                insertIntoList(sb, sentences);
                sb = new StringBuilder();
            }
            break;
        case '\u2026': // horizontal ellipsis
            if (i < chars.length - 1 && chars[i + 1] == '\u2026') {
                // Consume the second half of the double ellipsis as well.
                sb.append('\u2026');
                ++i;
                insertIntoList(sb, sentences);
                sb = new StringBuilder();
            }
            break;
        // Full-width marks are written as Unicode escapes so that a width-folding
        // tool cannot turn them into duplicates of the ASCII labels.
        case '\uFF0C': // full-width comma
        case ',':
        case '\uFF1B': // full-width semicolon
        case ';':
            if (!shortest) {
                break;
            }
            // shortest mode: fall through and treat the mark as a sentence end
        case '\u3002': // ideographic full stop
        case '\uFF01': // full-width exclamation mark
        case '!':
        case '\uFF1F': // full-width question mark
        case '?':
        case '\n':
        case '\r':
            insertIntoList(sb, sentences);
            sb = new StringBuilder();
            break;
        default:
            break;
        }
    }
    // Flush whatever follows the last terminator.
    if (sb.length() > 0) {
        insertIntoList(sb, sentences);
    }
    return sentences;
}

/**
Expand All @@ -163,7 +215,7 @@ public static List<String> splitSentence(String document) {
* @return 句子列表
*/
public static List<String> splitSentence(String document, boolean shortest) {
// NOTE(review): this text comes from a rendered commit diff whose +/- markers
// were lost, so the two return statements below appear to be the removed and
// the added side of the same line rather than consecutive statements.
// Confirm against the repository before compiling this as-is.
//
// Presumably the removed side: delegates to HanLP's SentencesUtil, forwarding shortest.
return SentencesUtil.toSentenceList(document, shortest);
// Presumably the added side: delegates to the private char[] overload in this
// class, forwarding shortest unchanged. Throws NullPointerException if document
// is null, because toCharArray() is invoked on it directly.
return splitSentence(document.toCharArray(), shortest);
}

/**
Expand Down

0 comments on commit 0c26759

Please sign in to comment.