1 Star 0 Fork 0

toliong/pythonbasic

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
spliteword.py 2.31 KB
一键复制 编辑 原始数据 按行查看 历史
toliong 提交于 2020-04-02 10:20 . add file
import jieba
# jieba库 分词的三种模式
# 1. 精确模式, 2. 全模式 3. 搜索引擎模式
#精确模式
# print(jieba.lcut('中国是一个伟大的国家'))
# #全模式
# print(jieba.lcut('中国是一个伟大的国家', cut_all=True))
# #搜索引擎模式
# print(jieba.lcut_for_search("中华人民共和国是伟大的"))
# #添加分词
# jieba.add_word('蟒蛇语言')
# def getText():
# txt = open("e:\hamlet.txt","r").read() # 打开源文件,并将文件内容转换成字符串
# txt = txt.lower() # 字符串全变成小写
# for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
# txt = txt.replace(ch, " ") # replace special characters with spaces (str.replace returns a new string, so assign it back)
# return txt
# hamletTxt = getText() # 获取哈姆雷特文件
# words = hamletTxt.split() # 将文件内容由字符串以空格分割成列表
# counts = {} # 词名和次数组成的空字典
# for word in words:
# counts[word] = counts.get(word,0) + 1# 添加单词和次数到字典中
# items = list(counts.items()) # 字典转换成列表
# items.sort(key=lambda x:x[1], reverse=True) # 按统计次数,倒叙排列
# for i in range(10): # 显示出现次数最多的10个词
# word, count = items[i]
# print("{0:<10}{1:>5}".format(word,count))
# CalThreeKingdomsV1.py
# Segment the text of threekingdoms.txt with jieba, count how often each
# multi-character word occurs (merging aliases of the same person), and print
# the 15 most frequent entries.

# Read the whole text; the context manager closes the file handle as soon as
# the content has been loaded instead of leaking it until interpreter exit.
with open("e:\\threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
words = jieba.lcut(txt)  # list of segmented words

# Frequent words that must not appear in the ranking.
excludes = {
    "将军", "却说", "荆州", "二人", "不可", "不能", "如此", "左右", "军马",
    "军士", "主公", "如何", "商议", "引兵", "次日", "大喜", "天下", "东吴", "于是",
}

# Alternative names / segmentation artifacts folded into one canonical name.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

counts = {}
for word in words:
    # Skip single-character tokens (punctuation, particles) and excluded words.
    if len(word) == 1 or word in excludes:
        continue
    rword = aliases.get(word, word)  # canonical name, or the word itself
    counts[rword] = counts.get(rword, 0) + 1

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)  # most frequent first

# Slicing (rather than indexing range(15)) avoids an IndexError when the text
# yields fewer than 15 distinct words.
for word, count in items[:15]:
    print("{0:<10}{1:>15}".format(word, count))
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/toliong/pythonbasic.git
git@gitee.com:toliong/pythonbasic.git
toliong
pythonbasic
pythonbasic
master

搜索帮助