Skip to content

Commit

Permalink
[issue-85] Refine training data and add invalid irregular rules
Browse files Browse the repository at this point in the history
  • Loading branch information
shin285 committed Jan 20, 2020
1 parent 5874757 commit aa980b6
Show file tree
Hide file tree
Showing 18 changed files with 326 additions and 9 deletions.
Binary file modified models_full/irregular.model
Binary file not shown.
Binary file modified models_full/observation.model
Binary file not shown.
Binary file modified models_full/transition.model
Binary file not shown.
Binary file modified models_light/irregular.model
Binary file not shown.
Binary file modified models_light/observation.model
Binary file not shown.
Binary file modified models_light/transition.model
Binary file not shown.
5 changes: 4 additions & 1 deletion resources/irrDic.remove.txt
Original file line number Diff line number Diff line change
Expand Up @@ -532,5 +532,8 @@ remove : ㅇㅣ/VCP ㅈㅣ
key : ㄱㅏㅈㅣ
remove : ㄲㅏㅈㅣ

key ; ㅇㅔㅅㅓ
key : ㅇㅔㅅㅓ
remove : ㅁㅕㄴ/NNG ㅇㅔㅅㅓ

key : ㄷㅗ
remove : ㅈㅜㅇ
245 changes: 245 additions & 0 deletions src/main/java/kr/co/shineware/nlp/komoran/util/HangulJamoUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
package kr.co.shineware.nlp.komoran.util;

import java.util.ArrayList;
import java.util.List;

/**
 * Helpers for working with conjoining Hangul Jamo characters.
 *
 * <p>Conversion maps three contiguous ranges of conjoining jamo onto their
 * Hangul Compatibility Jamo counterparts:
 * <ul>
 *   <li>leading consonants U+1100..U+1112,</li>
 *   <li>vowels U+1161..U+1175,</li>
 *   <li>trailing consonants U+11A8..U+11C2.</li>
 * </ul>
 * Every other character is passed through untouched.
 */
public class HangulJamoUtil {

    /** Code point of the first mapped leading consonant. */
    private static final int CHOSUNG_BASE = 0x1100;

    /** Code point of the first mapped vowel. */
    private static final int JUNGSUNG_BASE = 0x1161;

    /** Code point of the first mapped trailing consonant. */
    private static final int JONGSUNG_BASE = 0x11A8;

    /**
     * Compatibility jamo for U+1100..U+1112, indexed by offset from {@link #CHOSUNG_BASE}:
     * ㄱ ㄲ ㄴ ㄷ ㄸ ㄹ ㅁ ㅂ ㅃ ㅅ ㅆ ㅇ ㅈ ㅉ ㅊ ㅋ ㅌ ㅍ ㅎ.
     */
    private static final char[] CHOSUNG_TABLE = (
            "\u3131\u3132\u3134\u3137\u3138\u3139\u3141\u3142\u3143\u3145"
            + "\u3146\u3147\u3148\u3149\u314A\u314B\u314C\u314D\u314E").toCharArray();

    /**
     * Compatibility jamo for U+1161..U+1175, indexed by offset from {@link #JUNGSUNG_BASE}:
     * ㅏ ㅐ ㅑ ㅒ ㅓ ㅔ ㅕ ㅖ ㅗ ㅘ ㅙ ㅚ ㅛ ㅜ ㅝ ㅞ ㅟ ㅠ ㅡ ㅢ ㅣ.
     */
    private static final char[] JUNGSUNG_TABLE = (
            "\u314F\u3150\u3151\u3152\u3153\u3154\u3155\u3156\u3157\u3158"
            + "\u3159\u315A\u315B\u315C\u315D\u315E\u315F\u3160\u3161\u3162"
            + "\u3163").toCharArray();

    /**
     * Compatibility jamo for U+11A8..U+11C2, indexed by offset from {@link #JONGSUNG_BASE}:
     * ㄱ ㄲ ㄳ ㄴ ㄵ ㄶ ㄷ ㄹ ㄺ ㄻ ㄼ ㄽ ㄾ ㄿ ㅀ ㅁ ㅂ ㅄ ㅅ ㅆ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ.
     */
    private static final char[] JONGSUNG_TABLE = (
            "\u3131\u3132\u3133\u3134\u3135\u3136\u3137\u3139\u313A\u313B"
            + "\u313C\u313D\u313E\u313F\u3140\u3141\u3142\u3144\u3145\u3146"
            + "\u3147\u3148\u314A\u314B\u314C\u314D\u314E").toCharArray();

    /**
     * Replaces every mapped conjoining jamo in {@code source} with its compatibility jamo.
     *
     * @param source text to convert; must not be {@code null}
     * @return a new string of the same length with the mapped characters substituted
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static String ToHangulCompatibilityJamo(String source) {
        char[] converted = source.toCharArray();
        for (int idx = 0; idx < converted.length; idx++) {
            converted[idx] = toCompatibilityJamo(converted[idx]);
        }
        return new String(converted);
    }

    /**
     * Maps a single character. The three source ranges are disjoint and none of the
     * replacement characters falls inside any of them, so one lookup is sufficient.
     */
    private static char toCompatibilityJamo(char ch) {
        if (isCovered(ch, CHOSUNG_BASE, CHOSUNG_TABLE)) {
            return CHOSUNG_TABLE[ch - CHOSUNG_BASE];
        }
        if (isCovered(ch, JUNGSUNG_BASE, JUNGSUNG_TABLE)) {
            return JUNGSUNG_TABLE[ch - JUNGSUNG_BASE];
        }
        if (isCovered(ch, JONGSUNG_BASE, JONGSUNG_TABLE)) {
            return JONGSUNG_TABLE[ch - JONGSUNG_BASE];
        }
        return ch;
    }

    /** Tells whether {@code ch} lies in the range that starts at {@code base} and spans {@code table}. */
    private static boolean isCovered(char ch, int base, char[] table) {
        int offset = ch - base;
        return offset >= 0 && offset < table.length;
    }

    /**
     * Collects, in order of appearance, the characters of {@code source} that belong to
     * the {@link Character.UnicodeBlock#HANGUL_JAMO} block.
     *
     * @param source text to scan; must not be {@code null}
     * @return a new mutable list of the matching characters (empty when there are none)
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static List<Character> getHangulJamos(String source) {
        List<Character> collected = new ArrayList<>();
        for (char candidate : source.toCharArray()) {
            if (Character.UnicodeBlock.HANGUL_JAMO.equals(Character.UnicodeBlock.of(candidate))) {
                collected.add(candidate);
            }
        }
        return collected;
    }
}
Binary file modified src/main/resources/models_full/irregular.model
Binary file not shown.
Binary file modified src/main/resources/models_full/observation.model
Binary file not shown.
Binary file modified src/main/resources/models_full/transition.model
Binary file not shown.
Binary file modified src/main/resources/models_light/irregular.model
Binary file not shown.
Binary file modified src/main/resources/models_light/observation.model
Binary file not shown.
Binary file modified src/main/resources/models_light/transition.model
Binary file not shown.
20 changes: 19 additions & 1 deletion src/test/java/kr/co/shineware/nlp/komoran/core/KomoranTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
import kr.co.shineware.nlp.komoran.util.ElapsedTimeChecker;
import kr.co.shineware.util.common.file.FileUtil;
import kr.co.shineware.util.common.model.Pair;
import kr.co.shineware.util.common.string.StringUtil;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;

Expand All @@ -25,9 +28,24 @@ public void init() {
this.komoran = new Komoran(DEFAULT_MODEL.LIGHT);
}

/**
 * Scratch test that dumps, for a sample Korean string, the Unicode block and code point
 * of each character via {@code printcodePointAndUnicodeBlock}. It makes no assertions;
 * the output is meant to be inspected by hand.
 *
 * <p>Swap the literal below for any other sample to inspect it (an earlier sample mixed
 * a precomposed syllable with a bare conjoining consonant).
 */
@Test
public void getUnicode() {
    // Single sample only: the previous first assignment was overwritten before use,
    // and nothing here throws a checked exception, so no throws clause is needed.
    String korean = "난";
    printcodePointAndUnicodeBlock(korean);
}

/**
 * Prints one line per character of the string returned by
 * {@code StringUtil.korean2JasoString(korean)} (presumably its jaso decomposition —
 * confirm against that helper), in the form {@code <char> : <UnicodeBlock>(U+XXXX)}.
 *
 * @param korean text to decompose and dump to standard output
 */
private void printcodePointAndUnicodeBlock(String korean) {
    final String jaso = StringUtil.korean2JasoString(korean);
    final int length = jaso.length();
    for (int pos = 0; pos < length; pos++) {
        char current = jaso.charAt(pos);
        Character.UnicodeBlock block = Character.UnicodeBlock.of(current);
        // codePointAt (not the char itself) so a surrogate pair reports its full code point.
        String codePoint = String.format("U+%04X", jaso.codePointAt(pos));
        System.out.println(current + " : " + block + "(" + codePoint + ")");
    }
}

@Test
public void nBestAnalyzeResultTest() {
List<KomoranResult> nbestResult = this.komoran.analyze("가을", 2);
List<KomoranResult> nbestResult = this.komoran.analyze("치뜬", 1);
for (KomoranResult result : nbestResult) {
System.out.println(result.getPlainText());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package kr.co.shineware.nlp.komoran.core;

import kr.co.shineware.nlp.komoran.constant.SEJONGTAGS;
import kr.co.shineware.nlp.komoran.util.HangulJamoUtil;
import kr.co.shineware.util.common.file.FileUtil;
import kr.co.shineware.util.common.string.StringUtil;
import org.junit.Before;
Expand Down Expand Up @@ -90,6 +91,10 @@ public void loadKEBilingualCorpusAndConvertUTF8() throws IOException {
answer += line.replaceAll("</mor><pos>", "/").replaceAll("<.+?>", "") + " ";
}
if (line.startsWith("</tok>")) {

problem = convertUnicodeJamoToJamoCompatibility(problem);
answer = convertUnicodeJamoToJamoCompatibility(answer);

if (!isValidFormat(problem + "\t" + answer)) {
problem = "";
answer = "";
Expand Down Expand Up @@ -150,7 +155,7 @@ private boolean isValidFormat(String convertedPair) {
}
String pos = morphPosToken[1].trim();
if (!sejongTagSet.contains(pos)) {
System.out.println("Wrong POS : (" + pos + ")" + convertedPair);
// System.out.println("Wrong POS : (" + pos + ")" + convertedPair);
return false;
}
for (int i = 0; i < pos.length(); i++) {
Expand Down Expand Up @@ -201,6 +206,7 @@ public void loadKJBilingualCorpusAndConvertUTF8() throws IOException {
}

if (isHeadArea || isSentenceArea) {
line = convertUnicodeJamoToJamoCompatibility(line);
String[] entity = line.split("\t");
if (entity.length != 2) {
continue;
Expand All @@ -213,7 +219,7 @@ public void loadKJBilingualCorpusAndConvertUTF8() throws IOException {
}
answer = answer.replaceAll("\\+", " ").replaceAll(" {2}", " +");
if (!isValidFormat(problem + "\t" + answer)) {
System.out.println(filename + ":" + lineCount + ":" + line);
// System.out.println(filename + ":" + lineCount + ":" + line);
continue;
}
bw.write(problem + "\t" + answer);
Expand Down Expand Up @@ -248,6 +254,7 @@ public void loadSejongSpeechTextAndConvertUTF8() throws IOException {
}

if (isTextArea) {
line = convertUnicodeJamoToJamoCompatibility(line);
String[] entity = line.split("\t");
if (entity.length != 3) {
continue;
Expand All @@ -266,7 +273,7 @@ public void loadSejongSpeechTextAndConvertUTF8() throws IOException {
continue;
}
if (!isValidFormat(problem + "\t" + answer)) {
System.out.println(filename + ":" + lineCount + ":" + line);
// System.out.println(filename + ":" + lineCount + ":" + line);
continue;
}
bw.write(problem + "\t" + answer);
Expand Down Expand Up @@ -313,11 +320,12 @@ public void loadSejongTextAndConvertUTF8() throws IOException {

if (isHeadArea || isPhraseArea) {
try {
line = convertUnicodeJamoToJamoCompatibility(line);
String problem = line.split("\t")[1];
String answers = line.split("\t")[2];
answers = answers.replaceAll(" \\+ ", " ");
if (!isValidFormat(problem + "\t" + answers)) {
System.out.println(filename + ":" + lineCount + ":" + line);
// System.out.println(filename + ":" + lineCount + ":" + line);
continue;
}
bw.write(problem + "\t" + answers);
Expand All @@ -334,4 +342,8 @@ public void loadSejongTextAndConvertUTF8() throws IOException {
}
bw.close();
}

/**
 * Runs {@code line} through {@link HangulJamoUtil#ToHangulCompatibilityJamo(String)}
 * and returns the result.
 *
 * @param line raw corpus text
 * @return the value produced by {@code HangulJamoUtil.ToHangulCompatibilityJamo}
 */
private String convertUnicodeJamoToJamoCompatibility(String line) {
    final String compatibilityLine = HangulJamoUtil.ToHangulCompatibilityJamo(line);
    return compatibilityLine;
}
}
Loading

0 comments on commit aa980b6

Please sign in to comment.