# pythonbasic/spliteword.py

import jieba

# jieba库 分词的三种模式
# 1. 精确模式， 2. 全模式 3. 搜索引擎模式

#精确模式
# print(jieba.lcut('中国是一个伟大的国家'))
# #全模式
# print(jieba.lcut('中国是一个伟大的国家', cut_all=True))
# #搜索引擎模式
# print(jieba.lcut_for_search("中华人民共和国是伟大的"))
# #添加分词
# jieba.add_word('蟒蛇语言')

# def getText():
#     txt = open("e:\hamlet.txt","r").read()  # 打开源文件,并将文件内容转换成字符串
#     txt = txt.lower()                    # 字符串全变成小写
#     for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
#         txt.replace(ch, " ")             # 特殊字符替换成空格
#     return txt


# hamletTxt = getText()                    # 获取哈姆雷特文件
# words = hamletTxt.split()                # 将文件内容由字符串以空格分割成列表

# counts = {}                              # 词名和次数组成的空字典
# for word in words:
#     counts[word] = counts.get(word,0) + 1# 添加单词和次数到字典中

# items = list(counts.items())             # 字典转换成列表
# items.sort(key=lambda x:x[1], reverse=True)   # 按统计次数，倒序排列

# for i in range(10):                      # 显示出现次数最多的10个词
#     word, count = items[i]
#     print("{0:<10}{1:>5}".format(word,count))


# CalThreeKingdomsV1.py
# Read the whole source text. The context manager closes the file handle
# as soon as the content is in memory instead of leaking it.
with open("e:\\threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Segment the text into words with jieba.
words = jieba.lcut(txt)

# Words that are left out of the ranking entirely.
excludes = {
    "将军", "却说", "荆州", "二人", "不可", "不能", "如此", "左右", "军马",
    "军士", "主公", "如何", "商议", "引兵", "次日", "大喜", "天下", "东吴", "于是",
}

# Alternative spellings that are counted under a single canonical name.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

# Tally occurrences per canonical word.
counts = {}
for word in words:
    # Single-character tokens and excluded words are not counted.
    # No alias key appears in `excludes`, so testing exclusion before the
    # alias lookup yields the same result as the original elif chain.
    if len(word) == 1 or word in excludes:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# Most frequent first; the sort is stable, so ties keep first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Print the top 15 entries. Slicing (rather than indexing 0..14) avoids an
# IndexError when the text produces fewer than 15 distinct words.
for word, count in items[:15]:
    print("{0:<10}{1:>15}".format(word, count))