#CalLunyuV2.py
import jieba
# Count the most frequent multi-character tokens in lunyu.txt and print the
# top ten, after merging a few aliases into one canonical name.

# Read the whole text; the context manager closes the file handle that a bare
# open(...).read() would leave open.
with open("lunyu.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Tokens dropped from the tally before ranking (the list reads as common
# words and phrases rather than people's names).
excludes = {
    "君子", "可以", "仁者", "何如", "可谓", "不知", "小人", "不可", "不能",
    "天下", "三子", "不足", "而已", "不如", "无道", "大夫", "不食", "问政",
    "朋友", "至于", "与其", "不得", "弟子", "三年", "何以", "以为", "如之何",
    "何有",
}

# Tokens that are credited to another name when counting.
aliases = {
    "夫子": "孔子",
    "子谓": "孔子",
    "子曰": "孔子",
    "子见": "孔子",
    "子贡曰": "子贡",
}

words = jieba.lcut(txt)

counts = {}
for word in words:
    # Single-character tokens are skipped entirely.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default instead of del: an excluded word that never occurs in
# the segmented text must not abort the script with KeyError.
for word in excludes:
    counts.pop(word, None)

# Highest count first; sorted() is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slicing tolerates texts that yield fewer than ten distinct tokens.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
结果显示
孔子 71
子路 34
子贡 30
颜渊 15
曾子 11
樊迟 9
子张 9
冉有 8
管仲 8
仲弓 7
问题:
《论语》里有大量“子曰”,但“孔子”只统计到 71 次,说明 jieba 默认分词没有把“子曰”切成一个词,这些出现都没有被算进去。
改进:用 jieba.add_word 把“子曰”“子见”“子闻”加入词典,并把“子闻”也归并为“孔子”,再重新统计。
#CalLunyuV2.py
import jieba
# Count the most frequent multi-character tokens in lunyu.txt and print the
# top ten, after merging a few aliases into one canonical name.

# Add these phrases to jieba's dictionary before segmenting, so that they can
# come out of lcut() as single tokens and be matched by the alias table below.
for phrase in ("子曰", "子见", "子闻"):
    jieba.add_word(phrase)

# Read the whole text; the context manager closes the file handle that a bare
# open(...).read() would leave open.
with open("lunyu.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Tokens dropped from the tally before ranking (the list reads as common
# words and phrases rather than people's names).
excludes = {
    "君子", "可以", "仁者", "何如", "可谓", "不知", "小人", "不可", "不能",
    "天下", "三子", "不足", "而已", "不如", "无道", "大夫", "不食", "问政",
    "朋友", "至于", "与其", "不得", "弟子", "三年", "何以", "以为", "如之何",
    "何有",
}

# Tokens that are credited to another name when counting.
aliases = {
    "夫子": "孔子",
    "子谓": "孔子",
    "子曰": "孔子",
    "子见": "孔子",
    "子闻": "孔子",
    "子贡曰": "子贡",
}

words = jieba.lcut(txt)

counts = {}
for word in words:
    # Single-character tokens are skipped entirely.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default instead of del: the added dictionary words change the
# segmentation, so an excluded word may no longer occur at all, and a missing
# key must not abort the script with KeyError.
for word in excludes:
    counts.pop(word, None)

# Highest count first; sorted() is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slicing tolerates texts that yield fewer than ten distinct tokens.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
结果
孔子 415
子路 34
子贡 30
颜渊 15
樊迟 9
子张 9
冉有 8
管仲 8
仲弓 7
从政 7