-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmorphology.py
144 lines (125 loc) · 3.43 KB
/
morphology.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import random
import os
import esp_stems
# Word endings recognised by get_ending(): a word is split at the first entry
# of this list that it ends with.
# NOTE(review): the rows look grouped by part of speech (verb, adjective,
# noun, adverb forms) — confirm before reordering.
endings = [
'i', 'as', 'is', 'os', 'us', 'u',
'a', 'an', 'aj', 'ajn',
'uj',
'o', 'oj', 'on', 'ojn',
'e', 'en', 'es',
]
# Suffixes recognised by get_suffix(); they are looked for only after an
# ending has already been removed (see get_stem()).
suffixes = [
'at', 'it', 'ot', 'ant', 'int', 'ont',
'ig', 'iĝ',
# Further suffixes that are currently disabled:
# 'er', 'ec', 'aĉ', 'ist',
# 'uj', 'ej', 'ar', 'ad', 'in',
]
def get_ending(word):
    """Split *word* into ``[stem, ending]`` using the ``endings`` table.

    The first entry of ``endings`` that *word* ends with is cut off.  When
    no entry matches, ``[word, '']`` is returned.
    """
    matched = next((cand for cand in endings if word.endswith(cand)), None)
    if matched is None:
        return [word, '']
    return [word[:-len(matched)], matched]
def get_suffix(word):
    """Split *word* into ``[base, suffix]`` using the ``suffixes`` table.

    The table is scanned in order and the first suffix that *word* ends
    with wins.  ``[word, '']`` is returned when none of them matches.
    """
    for candidate in suffixes:
        if not word.endswith(candidate):
            continue
        cut = len(candidate)
        return [word[:-cut], candidate]
    return [word, '']
def get_stem(word):
    """Decompose *word* into its stem plus the parts that were stripped.

    Returns one of:
      * ``[word]``                 - no known ending was found,
      * ``[stem, ending]``         - an ending but no suffix was found,
      * ``[stem, suffix, ending]`` - both were found.

    Only a single suffix layer is removed.
    """
    base, ending = get_ending(word)
    if not ending:
        return [base]
    root, suffix = get_suffix(base)
    if suffix:
        return [root, suffix, ending]
    return [root, ending]
# Most recent value handed out by randint(); starts as a coin flip.
seed = random.randint(0, 1)


def randint(a, b):
    """Return a random integer N with ``a <= N <= b``.

    The drawn value is also stored in the module-level ``seed`` so the last
    result can be inspected afterwards.

    Args:
        a: Inclusive lower bound.
        b: Inclusive upper bound.

    Returns:
        The drawn integer.
    """
    global seed
    # The bounds used to be ignored (the draw was always from 0..1); honour
    # them so the function does what its signature promises.
    seed = random.randint(a, b)
    return seed
# Character groups from which change_chars() picks a replacement letter.
# NOTE(review): 'd', 'ŝ' and 'ŭ' do not occur in `consonants` — confirm that
# leaving them out is intentional.
consonants = "cĉjĵklzvbnmrtsfgĝhĥp"
vowels = "aeiou"
def rand_char(group, c):
    """Return a random character of *group* that is different from *c*.

    Args:
        group: String of candidate characters.
        c: Character that must not be returned; every occurrence of it is
            removed from the candidates first.

    Returns:
        One character drawn uniformly from the remaining candidates.

    Raises:
        ValueError: If *group* contains no character other than *c*.
    """
    candidates = group.replace(c, '')
    if not candidates:
        # Fail with a clear message instead of the "empty range" error the
        # index-based draw used to produce.
        raise ValueError(
            "no replacement character available in %r for %r" % (group, c)
        )
    return random.choice(candidates)
def change_chars(group: str, s: str) -> "str | None":
    """Replace one character of *s* that belongs to *group* by another one.

    A coin flip (``randint(0, 1)``) decides whether the last (``0``) or the
    first (``1``) character of *s* found in *group* is replaced.  The
    replacement comes from ``rand_char`` and is therefore always a different
    character of *group*.

    Args:
        group: Characters that may be replaced and that replacements are
            drawn from.
        s: The string to alter.

    Returns:
        The altered string, or ``None`` when *s* contains no character of
        *group* (nothing could be changed).
    """
    from_end = randint(0, 1) == 0
    # Scanning from the end is equivalent to reversing the string, replacing
    # the first hit and reversing back again.
    positions = range(len(s) - 1, -1, -1) if from_end else range(len(s))
    for pos in positions:
        if s[pos] in group:
            return s[:pos] + rand_char(group, s[pos]) + s[pos + 1:]
    return None
def get_fake_word(word: str) -> "str | None":
    """Build a fake Esperanto word from *word*.

    The word is split with ``get_stem``.  One consonant or one vowel of the
    stem is then replaced at random, and this is retried until the altered
    stem is not contained in ``esp_stems.esp_stems``.  The stripped suffix
    and ending are appended again unchanged.

    Args:
        word: The word to derive a fake word from.

    Returns:
        The fake word, or ``None`` when the stem is at most one character
        long or contains no character from ``consonants`` or ``vowels``.
    """
    parts = get_stem(word)
    stem = parts[0]
    if len(stem) <= 1:
        return None
    # Without at least one replaceable character change_chars() can never
    # succeed, and the retry loop below would spin forever.
    if not any(ch in consonants or ch in vowels for ch in stem):
        return None
    while True:
        group = consonants if randint(0, 1) == 0 else vowels
        candidate = change_chars(group, stem)
        if candidate is None:
            # The chosen group does not occur in the stem; draw again.
            continue
        if candidate not in esp_stems.esp_stems:
            break
    parts[0] = candidate
    return "".join(parts)
def remove_specials(s: str) -> str:
    """Return *s* with every listed punctuation mark and digit blanked out.

    Each special character is replaced by exactly one space, so the length
    of the string does not change.
    """
    specials = "'\",.?!-()=[]{}#*+~1234567890"
    blank_out = str.maketrans(dict.fromkeys(specials, ' '))
    return s.translate(blank_out)
def get_stems(lines: list, stems: dict):
    """Collect a fake word for every lower-case stem found in *lines*.

    Each line is treated as a tab-separated record whose second column holds
    the text to analyse.  That text is cleaned with ``remove_specials`` and
    split into words.  For every word whose stem contains no upper-case
    letter, ``stems[stem]`` is set to ``get_fake_word(word)`` (which may be
    ``None``) and the pair is printed.

    Args:
        lines: Raw lines of a TSV file; a trailing line break is allowed.
        stems: Mapping that is updated in place (stem -> fake word or None).
    """
    for line in lines:
        # Drop the line break so it cannot stick to the last word and hide
        # its ending from get_stem().
        row = line.rstrip('\r\n').split('\t')
        if len(row) < 2:
            # Blank or malformed line without a second column.
            continue
        # split() without arguments also discards the empty strings that
        # runs of blanks would otherwise produce.
        for w in remove_specials(row[1]).split():
            stem_tail = get_stem(w)
            if stem_tail[0].lower() == stem_tail[0]:
                stems[stem_tail[0]] = get_fake_word(w)
                print(w + " " + str(stems[stem_tail[0]]))
# Directory that is searched recursively for .tsv files; the relative path is
# resolved against the current working directory.
courses = "../memlingo/courses"
def find_pattern_files(directory, pattern):
    """Recursively list the files below *directory* whose name ends with *pattern*.

    Args:
        directory: Root of the tree to walk.
        pattern: Required file-name ending, e.g. ``'.tsv'``.

    Returns:
        A list of paths (directory joined with file name) in the order in
        which ``os.walk`` reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(pattern)
    ]
# Build a stem -> fake word table from every .tsv file below `courses` and
# print it as "stem,fake word" lines.
stems = {}
tsvs = find_pattern_files(courses, '.tsv')
for tsv_file in tsvs:
    # The context manager closes the file even if reading fails.  The
    # encoding is given explicitly so the result does not depend on the
    # platform default — assumes the course files are UTF-8, TODO confirm.
    with open(tsv_file, "r", encoding="utf-8") as fp:
        lines = list(fp)
    get_stems(lines, stems)
for stem, fake in stems.items():
    # `fake` is None when get_fake_word() could not produce a word; str()
    # keeps the concatenation from raising TypeError in that case.
    print(stem + "," + str(fake))