-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcn_cheaker.py
65 lines (58 loc) · 2.06 KB
/
cn_cheaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from spell_checker import cn_correct
import jieba
import jieba.posseg as pseg
# Single characters that cn_sen_correct never merges with a neighbouring token
# before spell-checking (see cheakM).
m_list = [
    '你', '那', '为', '了', '呢', '太', '呀', '很', '真', '是', '吗', '我', '们',
    '有', '也', '他', '她', '您', '它', '给', '让', '请', '对', '将', '这', '每',
    '帮', '看', '弄', '能', '在', '去', '说', '以', '可', '和', '与',
]

# Punctuation and whitespace that cn_sen_correct copies through unchanged
# (see cheakBD).  Each ASCII mark is paired with its full-width form, written
# as an escape so it survives copy/paste normalisation; previously the ASCII
# marks were simply listed twice, so full-width punctuation was not recognised.
bd_list = [
    '"', '\uff02',      # ASCII and full-width quotation mark
    '“', '”',
    '。', '.',
    ',', '\uff0c',      # ASCII and full-width comma
    '!', '\uff01',      # ASCII and full-width exclamation mark
    '?', '\uff1f',      # ASCII and full-width question mark
    ';', '\uff1b',      # ASCII and full-width semicolon
    '、',
    ':', '\uff1a',      # ASCII and full-width colon
    '《', '》',
    ' ',
]
def cheakBD(word):
    """Tell whether *word* is listed in the module-level ``bd_list``.

    Returns True if ``word`` compares equal to any entry of ``bd_list``,
    False otherwise.
    """
    return any(mark == word for mark in bd_list)
def cheakM(word):
    """Tell whether *word* is absent from the module-level ``m_list``.

    Returns False if ``word`` compares equal to any entry of ``m_list``,
    True otherwise.
    """
    return not any(entry == word for entry in m_list)
def cheekEn(word):
    """Return True when *word* is made up only of ASCII letters (a-z, A-Z).

    Each character is checked on its own.  Comparing the whole string against
    'a'/'z' lexicographically gave wrong answers for multi-character input
    (for example "zoo" was rejected and "a1" was accepted).  Single-character
    results are the same as before.

    An empty (or otherwise falsy) ``word`` yields False.
    """
    if not word:
        return False
    return all('a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in word)
def de_correct(cx_q, cx_h):
    """Choose between '的', '地' and '得' from the two surrounding tags.

    cx_q -- tag of the token before the particle.
    cx_h -- tag of the token after the particle ('' when there is none).

    The caller passes flags produced by jieba.posseg; 'd', 'a' and 'v' are
    presumably its adverb, adjective and verb tags -- confirm against the
    jieba tag set.

    Returns '地' for ('d' or 'a') followed by 'v', '得' for 'v' followed by
    'a', and '的' for every other combination.
    """
    preceded_by_modifier = cx_q in ('d', 'a')
    if preceded_by_modifier and cx_h == 'v':
        return '地'
    if (cx_q, cx_h) == ('v', 'a'):
        return '得'
    return '的'
def cn_sen_correct(res, cx):
    """Rebuild a sentence from segmented tokens, correcting it token by token.

    res -- list of token strings (the segmented sentence).
    cx  -- list of tags, where ``cx[i]`` is taken to describe ``res[i]``.

    Handling per token:
      * two or more characters: passed to ``cn_correct``;
      * '的' / '地' / '得': replaced by ``de_correct`` using the tags of the
        neighbouring positions;
      * entries of ``bd_list``, a space, or an ASCII letter: copied as-is;
      * any other single character: if it and the next token both pass
        ``cheakM`` and the next token is not in ``bd_list``, the two are
        joined and passed to ``cn_correct`` together (the next token is then
        skipped); otherwise the character is copied as-is.

    ``cn_correct`` comes from ``spell_checker``; it is presumably a spelling
    corrector that returns the corrected string -- confirm there.

    Returns the concatenation of all produced pieces.
    """
    # Only positions that exist in both lists have a usable tag.
    usable_flags = min(len(res), len(cx))

    def flag_at(pos):
        # A missing neighbour (before the first token, after the last one, or
        # beyond a shorter ``cx``) is reported as '' instead of wrapping
        # around via a negative index or raising IndexError.
        return cx[pos] if 0 <= pos < usable_flags else ''

    pieces = []
    skip_next = False
    last = len(res) - 1
    for i, token in enumerate(res):
        if skip_next:
            # This token was already consumed as the second half of a pair.
            skip_next = False
            continue

        if len(token) >= 2:
            pieces.append(cn_correct(token))
            continue

        if token in ('的', '地', '得'):
            pieces.append(de_correct(flag_at(i - 1), flag_at(i + 1)))
            continue

        if cheakBD(token) or token == ' ' or cheekEn(token):
            pieces.append(token)
            continue

        if (i < last and cheakM(token) and cheakM(res[i + 1])
                and not cheakBD(res[i + 1])):
            # Correct the lone character together with its right neighbour.
            pieces.append(cn_correct(token + res[i + 1]))
            skip_next = True
        else:
            pieces.append(token)

    return ''.join(pieces)
def main():
    """Interactive loop: read a sentence, print its corrected form, repeat.

    Ends when input is exhausted (EOF) or the user presses Ctrl-C.
    """
    while True:
        try:
            inText = input('input: ')
        except (EOFError, KeyboardInterrupt):
            # Leave quietly instead of dumping a traceback; the newline keeps
            # the shell prompt off the 'input: ' line.
            print()
            break

        # NOTE(review): the words come from jieba.cut but the tags come from
        # a separate pseg.cut pass.  cn_sen_correct indexes both by the same
        # position, so this assumes the two passes split the text the same
        # way -- confirm, or take words and flags from a single pseg.cut.
        res = list(jieba.cut(inText))
        flags = [pair.flag for pair in pseg.cut(inText)]
        print(cn_sen_correct(res, flags))


# Run the prompt only when executed as a script, so importing this module
# does not block on input().
if __name__ == '__main__':
    main()