From c78287dc51ddd904d162f8d3a678b95109a7ee75 Mon Sep 17 00:00:00 2001 From: Matt Chambers Date: Sat, 20 Aug 2022 19:03:11 -0400 Subject: [PATCH 1/6] Add polyphone overrides to Dictionary --- chinese/database.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/chinese/database.py b/chinese/database.py index 09f1d22..d9278b0 100644 --- a/chinese/database.py +++ b/chinese/database.py @@ -19,6 +19,7 @@ from os.path import dirname, join, realpath from sqlite3 import connect +import csv from .util import add_with_space @@ -28,6 +29,12 @@ def __init__(self): db_path = join(dirname(realpath(__file__)), 'data', 'db', 'chinese.db') self.conn = connect(db_path) self.c = self.conn.cursor() + polyphone_map_path = join(dirname(realpath(__file__)), 'data', 'db', 'polyphones.tsv') + self.polyphone_map = {} + with open(polyphone_map_path, encoding="utf-8") as file: + for line in csv.reader(file, delimiter="\t"): + if not line[0].startswith("#"): + self.polyphone_map[line[0]] = line[1] def create_indices(self): self.c.execute( @@ -42,6 +49,9 @@ def create_indices(self): def _get_word_pinyin(self, word, type_, prefer_tw=False, no_variants=True): from .transcribe import accentuate + if type_ == 'simp' and word in self.polyphone_map: + return ' '.join(accentuate(list(map(str.lower, self.polyphone_map[word].split())), 'pinyin')) + if type_ == 'trad': query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?' elif type_ == 'simp': @@ -248,4 +258,4 @@ def get_sentences(self, word): try: return self.c.fetchone() except: - return [] \ No newline at end of file + return [] From 72139831ae93cf38e75ee8f87e7395dd240f0ab2 Mon Sep 17 00:00:00 2001 From: Matt Chambers Date: Sat, 20 Aug 2022 19:04:40 -0400 Subject: [PATCH 2/6] Add polyphones.tsv --- chinese/polyphones.tsv | 265 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 chinese/polyphones.tsv diff --git a/chinese/polyphones.tsv b/chinese/polyphones.tsv new file mode 100644 index 0000000..08b9e87 --- /dev/null +++ b/chinese/polyphones.tsv @@ -0,0 +1,265 @@ +# hanzi pinyin +的 de5 +和 he2 +了 le5 +为 wei4 +将 jiang1 +说 shuo1 +与 yu3 +上 shang4 +大 da4 +要 yao4 +地 de5 +据 ju4 +着 zhe5 +种 zhong3 +把 ba4 +比 bi3 +好 hao3 +同 tong2 +分 fen1 +更 geng4 +会 hui4 +可 ke3 +几 ji3 +给 gei3 +场 chang3 +占 zhan4 +得 de2 +长 chang2 +作 zuo4 +阿 a1 +号 hao4 +只 zhi3 +看 kan4 +正 zheng4 +强 qiang2 +间 jian1 +卡 ka3 +当 dang1 +打 da3 +底 di3 +夫 fu1 +称 cheng1 +便 bian4 +头 tou2 +那 na4 +少 shao3 +难 nan4 +发 fa1 +令 ling4 +重 zhong4 +创 chuang4 +没 mei2 +率 shuai4 +奇 qi2 +塞 sai1 +华 hua2 +勒 le4 +度 du4 +差 cha1 +边 bian1 +化 hua4 +通 tong1 +合 he2 +跑 pao3 +菲 fei1 +见 jian4 +降 jiang4 +子 zi3 +扎 zha1 +草 cao3 +远 yuan3 +供 gong1 +耶 ye1 +什 shen2 +朝 chao2 +假 jia3 +听 ting1 +尽 jin4 +呢 ne5 +藏 cang2 +转 zhuan3 +蒙 meng2 +价 jia4 +板 ban3 +调 diao4 +压 ya1 +宁 ning2 +句 ju4 +抢 qiang3 +足 zu2 +空 kong1 +论 lun4 +卷 juan3 +吗 ma5 +传 chuan2 +般 ban1 +雨 yu3 +追 zhui1 +脚 jiao3 +校 xiao4 +圈 quan1 +娜 na4 +落 luo4 +倒 dao3 +伯 bo2 +结 jie2 +冲 chong1 +待 dai4 +鸟 niao3 +采 cai3 +读 du2 +哪 na3 +杆 gan1 +丽 li2 +氏 shi4 +堡 bao3 +色 se4 +畜 chu4 +铺 pu4 +佛 fo2 +炸 zha4 +免 mian3 +答 da2 +骑 qi2 +载 zai3 +喝 he1 +幢 zhuang4 +背 bei1 +页 ye4 +奔 ben1 +症 zheng4 +横 heng2 +食 shi2 +尾 wei3 +稍 shao1 +术 shu4 +语 yu3 +累 lei4 +趟 tang4 +亲 qin1 +划 hua2 +涌 yong3 +散 san4 +担 dan4 +赚 zhuan4 +服 fu2 +操 cao1 +解 jie3 +咱 za2 +岭 ling3 +鲜 xian1 +弹 tan2 +涨 zhang3 +甚 shen4 +薄 bao2 +撒 sa3 +斗 dou3 +禁 jin4 +泥 ni2 +挑 tiao1 +埋 mai2 +钻 zuan4 +折 zhe2 +监 jian1 +挣 zheng4 +挡 dang3 +摸 mo1 +炮 pao4 +参 can1 +辟 pi4 +似 si4 +屯 tun2 +虾 xia1 +弄 nong4 +秘 mi4 +喷 pen1 +荷 he2 +泡 pao4 +浅 qian3 +否 fou3 +混 hun4 +匹 pi3 +磨 mo2 +沈 shen3 +模 mo2 +夹 jia2 +踏 ta4 +渐 jian4 +尺 chi3 +扛 kang2 +喂 wei4 +浆 jiang1 +轴 zhou2 +谜 mi2 +档 dang3 +拚 pin1 +脏 zang1 +歪 wai1 +莎 sha1 +扫 sao3 +沉 chen2 +识 shi2 +侧 ce4 +仔 zai3 +恶 e4 +晃 huang4 +尿 niao4 +臭 chou4 +饮 yin3 +柜 gui4 +脉 mai4 +芯 xin1 +凉 liang2 +妻 qi1 +挨 ai1 +缝 feng2 +腊 la4 +刹 sha1 +罢 ba4 +宿 su4 +刷 shua1 +俊 jun4 +膏 gao1 +抹 mo3 +削 xiao1 +唯 wei2 +咋 za3 +旋 xuan2 +钉 ding1 +拾 shi2 +澄 cheng2 +掺 chan1 +桔 ju2 +杠 gang4 +锯 ju4 +厕 ce4 +匾 bian3 +揣 chuai3 +弟 di4 +笼 long2 +衰 shuai1 +胖 pang4 +汞 gong3 +漂 piao1 +殷 yin3 +劈 pi1 +驮 tuo2 +刨 pao2 +粥 zhou1 +朴 pu3 +估 gu1 +熬 ao2 +隐 yin3 +卒 zu2 +么 me5 +叉 cha1 +舌 she2 +攒 zan3 +楞 leng4 +雀 que4 +荡 dang4 +淋 lin2 +缉 ji1 +凹 ao1 From c6cab7340a0fe41a6a6d0b39d040b39531248060 Mon Sep 17 00:00:00 2001 From: Matt Chambers Date: Sat, 20 Aug 2022 19:10:53 -0400 Subject: [PATCH 3/6] Add tests for polyphones --- tests/test_transcribe.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index ec8c027..8aacbe1 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -109,6 +109,14 @@ def test_multiple_words(self): transcribe(['图书', '馆'], 'pinyin', 'simp'), ['tú shū', 'guǎn'] ) + def test_single_polyphone(self): + self.assertEqual(transcribe(['说'], 'pinyin', 'simp'), ['shuō']) + + def test_multiple_polyphones(self): + self.assertEqual( + transcribe(['你', '要', '说', '什么'], 'pinyin', 'simp'), ['nǐ', 'yào', 'shuō', 'shénme'] + ) + def test_no_chinese(self): self.assertEqual(transcribe(['foo'], 'pinyin', 'simp'), []) From dd0a757d7c0bb4b91c2c59be10e6958f98eadaff Mon Sep 17 00:00:00 2001 From: Matt Chambers Date: Sun, 21 Aug 2022 20:34:42 -0400 Subject: [PATCH 4/6] Add hanzi lookup for single character words --- chinese/database.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/chinese/database.py b/chinese/database.py index d9278b0..4cbf1e6 100644 --- a/chinese/database.py +++ b/chinese/database.py @@ -49,20 +49,25 @@ def create_indices(self): def _get_word_pinyin(self, word, type_, prefer_tw=False, no_variants=True): from .transcribe import accentuate + # first check polyphones override map if type_ == 'simp' and word in self.polyphone_map: return ' '.join(accentuate(list(map(str.lower, self.polyphone_map[word].split())), 'pinyin')) - if type_ == 'trad': - query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?' - elif type_ == 'simp': - query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE simplified=?' + # second use zidian for single characters instead of cidian + if len(word) == 1: + query = 'SELECT kMandarin, kMandarin FROM hanzi WHERE cp=?' else: - raise ValueError(type_) - - if no_variants: - query += """AND (english NOT LIKE '%variant%' OR english IS NULL) - AND (german NOT LIKE '%variant%' OR german IS NULL) - AND (french NOT LIKE '%variant%' OR french IS NULL)""" + if type_ == 'trad': + query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?' + elif type_ == 'simp': + query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE simplified=?' + else: + raise ValueError(type_) + + if no_variants: + query += """AND (english NOT LIKE '%variant%' OR english IS NULL) + AND (german NOT LIKE '%variant%' OR german IS NULL) + AND (french NOT LIKE '%variant%' OR french IS NULL)""" self.c.execute(query, (word,)) res = self.c.fetchone() From c703bd6e7a39daccce42e8811aa438f01eb22141 Mon Sep 17 00:00:00 2001 From: Matt Chambers Date: Sun, 21 Aug 2022 20:36:14 -0400 Subject: [PATCH 5/6] Remove redundant polyphone overrides The removed polyphones will get the correct pinyin from the hanzi table. --- chinese/polyphones.tsv | 222 +---------------------------------------- 1 file changed, 1 insertion(+), 221 deletions(-) diff --git a/chinese/polyphones.tsv b/chinese/polyphones.tsv index 08b9e87..13cd7d6 100644 --- a/chinese/polyphones.tsv +++ b/chinese/polyphones.tsv @@ -1,265 +1,45 @@ # hanzi pinyin -的 de5 -和 he2 -了 le5 -为 wei4 -将 jiang1 -说 shuo1 -与 yu3 -上 shang4 -大 da4 -要 yao4 地 de5 -据 ju4 -着 zhe5 -种 zhong3 把 ba4 -比 bi3 -好 hao3 -同 tong2 -分 fen1 -更 geng4 -会 hui4 -可 ke3 -几 ji3 -给 gei3 -场 chang3 -占 zhan4 -得 de2 长 chang2 -作 zuo4 -阿 a1 -号 hao4 -只 zhi3 -看 kan4 -正 zheng4 -强 qiang2 -间 jian1 -卡 ka3 -当 dang1 -打 da3 -底 di3 -夫 fu1 -称 cheng1 -便 bian4 -头 tou2 -那 na4 -少 shao3 难 nan4 -发 fa1 -令 ling4 -重 zhong4 -创 chuang4 -没 mei2 率 shuai4 -奇 qi2 -塞 sai1 -华 hua2 勒 le4 -度 du4 差 cha1 -边 bian1 -化 hua4 -通 tong1 -合 he2 -跑 pao3 -菲 fei1 -见 jian4 -降 jiang4 子 zi3 -扎 zha1 -草 cao3 -远 yuan3 -供 gong1 耶 ye1 -什 shen2 -朝 chao2 -假 jia3 -听 ting1 尽 jin4 -呢 ne5 -藏 cang2 -转 zhuan3 -蒙 meng2 -价 jia4 -板 ban3 -调 diao4 -压 ya1 -宁 ning2 -句 ju4 -抢 qiang3 -足 zu2 -空 kong1 -论 lun4 -卷 juan3 -吗 ma5 -传 chuan2 -般 ban1 -雨 yu3 -追 zhui1 -脚 jiao3 -校 xiao4 -圈 quan1 -娜 na4 -落 luo4 倒 dao3 -伯 bo2 -结 jie2 -冲 chong1 -待 dai4 -鸟 niao3 -采 cai3 -读 du2 -哪 na3 -杆 gan1 丽 li2 -氏 shi4 -堡 bao3 -色 se4 -畜 chu4 -铺 pu4 佛 fo2 -炸 zha4 -免 mian3 -答 da2 -骑 qi2 载 zai3 -喝 he1 幢 zhuang4 背 bei1 -页 ye4 -奔 ben1 -症 zheng4 -横 heng2 -食 shi2 -尾 wei3 -稍 shao1 -术 shu4 -语 yu3 -累 lei4 -趟 tang4 -亲 qin1 划 hua2 -涌 yong3 -散 san4 担 dan4 -赚 zhuan4 -服 fu2 -操 cao1 -解 jie3 咱 za2 -岭 ling3 -鲜 xian1 弹 tan2 -涨 zhang3 甚 shen4 薄 bao2 撒 sa3 斗 dou3 -禁 jin4 -泥 ni2 -挑 tiao1 -埋 mai2 钻 zuan4 -折 zhe2 -监 jian1 挣 zheng4 -挡 dang3 -摸 mo1 -炮 pao4 -参 can1 -辟 pi4 似 si4 -屯 tun2 -虾 xia1 -弄 nong4 -秘 mi4 -喷 pen1 -荷 he2 -泡 pao4 -浅 qian3 -否 fou3 -混 hun4 -匹 pi3 -磨 mo2 沈 shen3 -模 mo2 夹 jia2 -踏 ta4 -渐 jian4 -尺 chi3 -扛 kang2 -喂 wei4 -浆 jiang1 -轴 zhou2 -谜 mi2 档 dang3 拚 pin1 脏 zang1 -歪 wai1 -莎 sha1 -扫 sao3 -沉 chen2 识 shi2 -侧 ce4 仔 zai3 -恶 e4 晃 huang4 -尿 niao4 -臭 chou4 -饮 yin3 -柜 gui4 -脉 mai4 -芯 xin1 -凉 liang2 -妻 qi1 -挨 ai1 缝 feng2 -腊 la4 -刹 sha1 -罢 ba4 -宿 su4 -刷 shua1 -俊 jun4 -膏 gao1 -抹 mo3 削 xiao1 -唯 wei2 -咋 za3 -旋 xuan2 -钉 ding1 -拾 shi2 -澄 cheng2 掺 chan1 -桔 ju2 杠 gang4 -锯 ju4 -厕 ce4 -匾 bian3 揣 chuai3 -弟 di4 -笼 long2 -衰 shuai1 -胖 pang4 -汞 gong3 漂 piao1 殷 yin3 -劈 pi1 -驮 tuo2 -刨 pao2 -粥 zhou1 -朴 pu3 -估 gu1 -熬 ao2 -隐 yin3 -卒 zu2 -么 me5 -叉 cha1 -舌 she2 -攒 zan3 楞 leng4 -雀 que4 -荡 dang4 -淋 lin2 -缉 ji1 -凹 ao1 +陂 po1 From 991dfe47aac74f1d07f48c4153d43ffb1937c6d6 Mon Sep 17 00:00:00 2001 From: Matt Chambers Date: Sun, 21 Aug 2022 23:44:38 -0400 Subject: [PATCH 6/6] Add two-character polyphones and tests Fix Ruby/Bopomofo test (maybe due to using hanzi transcription) Move polyphones.tsv to the right location --- chinese/data/db/polyphones.tsv | 126 +++++++++++++++++++++++++++++++++ chinese/polyphones.tsv | 45 ------------ tests/test_ruby.py | 4 +- tests/test_transcribe.py | 10 ++- 4 files changed, 137 insertions(+), 48 deletions(-) create mode 100644 chinese/data/db/polyphones.tsv delete mode 100644 chinese/polyphones.tsv diff --git a/chinese/data/db/polyphones.tsv b/chinese/data/db/polyphones.tsv new file mode 100644 index 0000000..0fb048d --- /dev/null +++ b/chinese/data/db/polyphones.tsv @@ -0,0 +1,126 @@ +# hanzi pinyin +地 de5 +把 ba4 +长 chang2 +难 nan4 +率 shuai4 +勒 le4 +差 cha1 +子 zi3 +耶 ye1 +尽 jin4 +倒 dao3 +丽 li2 +佛 fo2 +载 zai3 +幢 zhuang4 +背 bei1 +划 hua2 +担 dan4 +咱 za2 +弹 tan2 +甚 shen4 +薄 bao2 +撒 sa3 +斗 dou3 +钻 zuan4 +挣 zheng4 +似 si4 +沈 shen3 +夹 jia2 +档 dang3 +拚 pin1 +脏 zang1 +识 shi2 +仔 zai3 +晃 huang4 +缝 feng2 +削 xiao1 +掺 chan1 +杠 gang4 +揣 chuai3 +漂 piao1 +殷 yin3 +楞 leng4 +陂 po1 +不是 bu2 shi5 +起来 qi3 lai5 +出来 chu1 lai2 +东西 dong1 xi5 +地方 di4 fang1 +告诉 gao4 su4 +当时 dang1 shi2 +女人 nü3 ren2 +过去 guo4 qu4 +结果 jie1 guo3 +多少 duo1 shao3 +过来 guo4 lai2 +故事 gu4 shi4 +精神 jing1 shen2 +人家 ren2 jia1 +不了 bu4 liao3 +当年 dang1 nian2 +妻子 qi1 zi3 +说道 shuo1 dao4 +便宜 pian2 yi5 +重点 zhong4 dian3 +土地 tu3 di4 +高中 gao1 zhong1 +说法 shuo1 fa3 +生意 sheng1 yi4 +老公 lao3 gong1 +尽量 jin3 liang4 +得了 de2 le5 +当天 dang1 tian1 +小子 xiao3 zi5 +好处 hao3 chu5 +好吃 hao3 chi1 +分子 fen1 zi3 +为人 wei2 ren2 +同行 tong2 hang2 +老子 lao3 zi5 +好玩 hao3 wan2 +大都 da4 dou1 +正当 zheng4 dang1 +所长 suo3 zhang3 +言语 yan2 yu3 +本事 ben3 shi4 +孙子 sun1 zi5 +恶心 e3 xin1 +重重 chong2 chong2 +跟前 gen1 qian2 +琢磨 zhuo2 mo2 +乖乖 guai1 guai1 +大方 da4 fang1 +个头 ge4 tou2 +温和 wen1 he2 +狮子 shi1 zi5 +当晚 dang1 wan3 +教会 jiao1 hui4 +开通 kai1 tong1 +看好 kan4 hao3 +大爷 da4 ye2 +工夫 gong1 fu1 +口音 kou3 yin1 +当日 dang1 ri4 +大王 da4 wang2 +得罪 de2 zui4 +转动 zhuan3 dong4 +结实 jie1 shi2 +转头 zhuan3 tou2 +空地 kong1 di4 +款式 kuan3 shi4 +扎实 zha1 shi5 +下场 xia4 chang5 +公道 gong1 dao4 +明朝 ming2 chao2 +澄清 cheng2 qing1 +分量 fen4 liang5 +小儿 xiao3 er2 +上头 shang4 tou2 +本色 ben3 se4 +单子 dan1 zi5 +下水 xia4 shui3 +冷战 leng3 zhan4 +端详 duan1 xiang2 +丁丁 ding1 ding1 diff --git a/chinese/polyphones.tsv b/chinese/polyphones.tsv deleted file mode 100644 index 13cd7d6..0000000 --- a/chinese/polyphones.tsv +++ /dev/null @@ -1,45 +0,0 @@ -# hanzi pinyin -地 de5 -把 ba4 -长 chang2 -难 nan4 -率 shuai4 -勒 le4 -差 cha1 -子 zi3 -耶 ye1 -尽 jin4 -倒 dao3 -丽 li2 -佛 fo2 -载 zai3 -幢 zhuang4 -背 bei1 -划 hua2 -担 dan4 -咱 za2 -弹 tan2 -甚 shen4 -薄 bao2 -撒 sa3 -斗 dou3 -钻 zuan4 -挣 zheng4 -似 si4 -沈 shen3 -夹 jia2 -档 dang3 -拚 pin1 -脏 zang1 -识 shi2 -仔 zai3 -晃 huang4 -缝 feng2 -削 xiao1 -掺 chan1 -杠 gang4 -揣 chuai3 -漂 piao1 -殷 yin3 -楞 leng4 -陂 po1 diff --git a/tests/test_ruby.py b/tests/test_ruby.py index 1c9770a..f37c95e 100644 --- a/tests/test_ruby.py +++ b/tests/test_ruby.py @@ -43,7 +43,7 @@ def test_ruby_bottom(self): def test_bopomofo(self): self.assertEqual(ruby(['機場'], 'bopomofo'), ['機[ㄐㄧ]場[ㄔㄤˇ]']) - self.assertEqual(ruby(['機', '場'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˊ]']) + self.assertEqual(ruby(['機', '場'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˇ]']) self.assertEqual( ruby(['加拿大人'], 'bopomofo'), ['加[ㄐㄧㄚ]拿[ㄋㄚˊ]大[ㄉㄚˋ]人[ㄖㄣˊ]'] ) @@ -51,7 +51,7 @@ def test_bopomofo(self): def test_bopomofo_punc(self): self.assertEqual(ruby(['機場。'], 'bopomofo'), ['機[ㄐㄧ]場[ㄔㄤˇ]。']) self.assertEqual( - ruby(['機', '場', '。'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˊ]', '。'] + ruby(['機', '場', '。'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˇ]', '。'] ) def test_jyutping_available(self): diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 8aacbe1..c05f561 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -112,9 +112,17 @@ def test_multiple_words(self): def test_single_polyphone(self): self.assertEqual(transcribe(['说'], 'pinyin', 'simp'), ['shuō']) + def test_single_zici_polyphone(self): + self.assertEqual(transcribe(['分子'], 'pinyin', 'simp'), ['fēn zǐ']) + def test_multiple_polyphones(self): self.assertEqual( - transcribe(['你', '要', '说', '什么'], 'pinyin', 'simp'), ['nǐ', 'yào', 'shuō', 'shénme'] + transcribe(['你', '要', '说', '什么'], 'pinyin', 'simp'), ['nǐ', 'yào', 'shuō', 'shén me'] + ) + + def test_multiple_zici_polyphones(self): + self.assertEqual( + transcribe(['重点', '分子', '便宜'], 'pinyin', 'simp'), ['zhòng diǎn', 'fēn zǐ', 'pián yi'] ) def test_no_chinese(self):