51 changes: 45 additions & 6 deletions silnlp/common/compare_lex.py
@@ -1,10 +1,26 @@
import argparse
from collections import Counter
import numpy
import re
from typing import List

from ..common.environment import SIL_NLP_ENV

from machine.tokenization import LatinWordTokenizer

# Latin Tokenizer from machine library
#def get_all_words(src_file: str) -> List:
# words = []
# tokenizer = LatinWordTokenizer()
# with open(src_file, "r", encoding = "utf8") as src_data_file:
# for line in src_data_file:
# line_words = tokenizer.tokenize(line)
# for word in line_words:
# word = word.strip().strip("\'\"\\;,:.!?()-[]").lower()
# if word != "" and not word.isnumeric():
# words.append(word)
# return words

# Naive whitespace-based script-agnostic word splitter
def get_all_words(src_file: str) -> List:
words = []
pattern = re.compile(r",(?=\S)") # Look for commas with no following space
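The remainder of this function is collapsed in the diff. As a rough sketch only (assuming the hidden lines space out tight commas and then split on whitespace, with the same stripping as the commented-out tokenizer above), the splitter might proceed like this; the name get_all_words_sketch is a hypothetical stand-in:

import re
from typing import List

def get_all_words_sketch(src_file: str) -> List[str]:
    # Hypothetical stand-in for the collapsed body of get_all_words
    words: List[str] = []
    pattern = re.compile(r",(?=\S)")  # commas with no following space
    with open(src_file, "r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            line = pattern.sub(", ", line)  # add the missing space so the comma strips off cleanly
            for word in line.split():
                word = word.strip("'\";,:.!?()-[]").lower()
                if word != "" and not word.isnumeric():
                    words.append(word)
    return words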
@@ -59,6 +75,8 @@ def main() -> None:
unique_src_words2 = numpy.unique(numpy.array(src_words2))
src1_only_words = find_unique(unique_src_words1,unique_src_words2)
src2_only_words = find_unique(unique_src_words2,unique_src_words1)
src1_word_counter = Counter(src_words1).most_common()
src2_word_counter = Counter(src_words2).most_common()

# Write unique source words to files
src_words_file1 = lex_path1 / "src_words.txt"
@@ -70,15 +88,36 @@ def main() -> None:
for word in unique_src_words2:
output_file.writelines(word+'\n')

# Re-write src_words files with counts
with (lex_path1 / "src_words.txt").open("w", encoding = "utf8") as output_file:
for entry in src1_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
with (lex_path2 / "src_words.txt").open("w", encoding = "utf8") as output_file:
for entry in src2_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')

# Write source words missing from the alternate source file
#with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
# output_file.writelines(f'src.txt words not found in {src_file2}\n')
# for word in src1_only_words:
# output_file.writelines(word+'\n')
#with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
# output_file.writelines(f'src.txt words not found in {src_file1}\n')
# for word in src2_only_words:
# output_file.writelines(word+'\n')


# Rewrite of above section to include counts in the output file:
with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {src_file2}\n')
for word in src1_only_words:
output_file.writelines(word+'\n')
for entry in src1_word_counter:
if entry[0] in src1_only_words:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {src_file1}\n')
for word in src2_only_words:
output_file.writelines(word+'\n')
for entry in src2_word_counter:
if entry[0] in src2_only_words:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
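If the word lists grow large, the `entry[0] in src1_only_words` test above rescans the whole array for every counter entry; converting the unique-word array to a set keeps each lookup constant time. A minimal sketch with the same output, reusing the variables defined above:

src1_only_set = set(src1_only_words)
with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
    output_file.write(f"src.txt words not found in {src_file2}\n")
    for word, count in src1_word_counter:
        if word in src1_only_set:
            output_file.write(f"{word}\t{count}\n")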

# Compare target words and write results to files
if args.trg:
@@ -109,7 +148,7 @@ def main() -> None:
for word in trg1_only_words:
output_file.writelines(word+'\n')
with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
output_file.writelines(f'src.txt words not found in {trg_file1}\n')
output_file.writelines(f'trg.txt words not found in {trg_file1}\n')
for word in trg2_only_words:
output_file.writelines(word+'\n')

17 changes: 12 additions & 5 deletions silnlp/common/count_words.py
@@ -15,8 +15,9 @@ def main() -> None:
parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
parser.add_argument("--stats", help="True or False: Print word count and number of renderings for common words",
parser.add_argument("--stats", help="Print word count and number of renderings for common words",
action='store_true')
parser.add_argument("--count", help="Include count in src word files", action='store_true')
args = parser.parse_args()

# Set up path and lex files
@@ -49,6 +50,8 @@ def main() -> None:
if word != "" and not word.isnumeric():
src_words.append(word)
src_data_word_counter = Counter(src_words).most_common(args.num)
if args.count:
src_word_counter = Counter(src_words).most_common()
unique_src_words = numpy.unique(numpy.array(src_words))

# Pull all the separate words from the target data. Take all unique.
Expand All @@ -65,7 +68,7 @@ def main() -> None:
trg_words.append(word)
unique_trg_words = numpy.unique(numpy.array(trg_words))

# Clean lexicon file and prep for pandas csv reader
# Prep lexicon file for pandas csv reader (escape quotes)
with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon:
with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex:
for line in lexicon.readlines():
@@ -111,9 +114,13 @@ def main() -> None:
for src_wd in common_wd:
writer.writerow([src_wd, *common_wd[src_wd]])

with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
for word in unique_src_words:
output_file.writelines(word + '\n')
with (lex_path / "src_words.txt").open("w", encoding = "utf8") as output_file:
if args.count:
for entry in src_word_counter:
output_file.writelines(entry[0] + '\t' + str(entry[1]) + '\n')
else:
for word in unique_src_words:
output_file.writelines(word + '\n')

# Optionally, output a few stats
if args.stats:
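With --count set, src_words.txt is written as one word<TAB>count line per entry (a line might read, say, yesu<TAB>412; that word and number are made up), which is the format find_words.py below reads back. A hypothetical invocation, assuming the scripts are run as modules from the repository root:

python -m silnlp.common.count_words MyExperiment --count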
71 changes: 71 additions & 0 deletions silnlp/common/find_words.py
@@ -0,0 +1,71 @@
import argparse
from collections import Counter
import csv
import unicodedata

from ..common.environment import SIL_NLP_ENV

# Normalize combining characters, e.g. for Devanagari (Ref: https://docs.python.org/3/howto/unicode.html#comparing-strings)
def NFD(s):
return unicodedata.normalize('NFD', s)
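# Illustrative example only (these literals are not used elsewhere in this script):
# a precomposed Devanagari letter and its decomposed equivalent are different code
# point sequences until normalized, e.g.
#   "\u0958" == "\u0915\u093C"            -> False  (QA vs. KA + NUKTA)
#   NFD("\u0958") == NFD("\u0915\u093C")  -> True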

def main():
parser = argparse.ArgumentParser(description="Finds verses containing words from a word list and scores them by word counts")
parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
parser.add_argument("--word_list", help="File containing words to find", default="unmatched_src_words.txt")
args = parser.parse_args()

# Set up path and files
lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment
word_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / args.word_list
vref_filename = SIL_NLP_ENV.align_experiments_dir / args.experiment / "refs.txt"

# Get count of each word in the file
with (lex_path / "src_words.txt").open("r", encoding="utf8") as src_wd_file:
src_word_counts = []
for entry in src_wd_file:
entry = entry.split('\t')
if len(entry) > 1:
entry[1] = int(entry[1].strip())
src_word_counts.append(entry)
else:
print("Error: word counts are missing. Please run count_words.py with the --count flag set.")
return 1
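# Each line of src_words.txt is expected to look like "<word>\t<count>", as written by
# count_words.py with --count; each pair is kept as [word, int(count)].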

# Extract list of words
src_word_dict = dict(src_word_counts)
with word_filename.open("r", encoding="utf8") as word_file:
words = []
for word in word_file:
words.append(word.rstrip('\n'))
# Check for words and word count in each verse; write to output file.
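# The "Novelty Score" written below sums the source-text counts of the listed words that
# occur in this verse and have not been seen in any earlier verse, so a high score marks
# a verse that introduces many (or frequent) unmatched words.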
with (lex_path / "src.txt").open("r", encoding = "utf8") as src_data_file:
with vref_filename.open("r", encoding="utf8") as ref_file:
word_list = list(enumerate(words))
result = []
seen_words = []
for verse in zip(ref_file, src_data_file):
word_text = []
word_num = []
word_count = 0
for word in word_list:
#if NFD(NFD(word[1])) in NFD(NFD(verse[1])):
#if word[1] in verse[1]: # (to find all instances; not just first)
if word[1] in verse[1] and word[1] not in seen_words:
for entry in src_word_counts:
if entry[0] == word[1]:
word_count += entry[1]
seen_words.append(word[1])
word_text.append(word[1])
word_num.append(src_word_dict[word[1]])
result.append([verse[0].rstrip('\n'), word_count, word_num, word_text])
with (lex_path / "unmatched_word_verses.txt").open("w", encoding = "utf8") as output_file:
writer = csv.writer(output_file, lineterminator="\n")
writer.writerow(['Reference','Novelty Score','Word Counts','Words'])
for line in result:
writer.writerow([line[0], line[1], line[2], *line[3]])
#print(result)


if __name__ == '__main__':
main()