# [web page residue, not part of the program] 代码拉取完成,页面将自动刷新
import jieba
# jieba库 分词的三种模式
# 1. 精确模式, 2. 全模式 3. 搜索引擎模式
#精确模式
# print(jieba.lcut('中国是一个伟大的国家'))
# #全模式
# print(jieba.lcut('中国是一个伟大的国家', cut_all=True))
# #搜索引擎模式
# print(jieba.lcut_for_search("中华人民共和国是伟大的"))
# #添加分词
# jieba.add_word('蟒蛇语言')
# def getText():
# txt = open("e:\hamlet.txt","r").read() # 打开源文件,并将文件内容转换成字符串
# txt = txt.lower() # 字符串全变成小写
# for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
# txt.replace(ch, " ") # 特殊字符替换成空格
# return txt
# hamletTxt = getText() # 获取哈姆雷特文件
# words = hamletTxt.split() # 将文件内容由字符串以空格分割成列表
# counts = {} # 词名和次数组成的空字典
# for word in words:
# counts[word] = counts.get(word,0) + 1# 添加单词和次数到字典中
# items = list(counts.items()) # 字典转换成列表
# items.sort(key=lambda x:x[1], reverse=True) # 按统计次数,倒叙排列
# for i in range(10): # 显示出现次数最多的10个词
# word, count = items[i]
# print("{0:<10}{1:>5}".format(word,count))
# CalThreeKingdomsV1.py
# Frequent words that should not appear in the ranking.
excludes = {
    "将军", "却说", "荆州", "二人", "不可", "不能", "如此", "左右", "军马",
    "军士", "主公", "如何", "商议", "引兵", "次日", "大喜", "天下", "东吴", "于是",
}

# Alternative spellings that are counted under a single canonical name.
ALIASES = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}


def count_names(words, excluded=None):
    """Count word occurrences, merging aliases and dropping unwanted words.

    Args:
        words: iterable of already-segmented words.
        excluded: collection of words to ignore; defaults to the module-level
            ``excludes`` set.

    Returns:
        dict mapping each kept word (aliases replaced by their canonical
        name) to the number of times it occurred, in first-seen order.
    """
    if excluded is None:
        excluded = excludes
    counts = {}
    for word in words:
        # Single-character tokens are skipped before any other check.
        if len(word) == 1:
            continue
        if word in ALIASES:
            rword = ALIASES[word]
        elif word in excluded:
            continue
        else:
            rword = word
        counts[rword] = counts.get(rword, 0) + 1
    return counts


def top_items(counts, limit=15):
    """Return up to ``limit`` (word, count) pairs, highest count first.

    Ties keep the order in which the words were first counted (the sort is
    stable). Slicing instead of indexing means a short input simply yields
    fewer rows rather than raising IndexError.
    """
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]


def main():
    """Read the novel, segment it with jieba and print the 15 top words."""
    # The context manager closes the file even if reading fails.
    with open("e:\\threekingdoms.txt", "r", encoding="utf-8") as source:
        txt = source.read()
    counts = count_names(jieba.lcut(txt))
    for word, count in top_items(counts, 15):
        print("{0:<10}{1:>15}".format(word, count))


if __name__ == "__main__":
    main()
# [web page residue, not part of the program] 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# [web page residue, not part of the program] 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。