代码拉取完成,页面将自动刷新
同步操作将从 冰封飞飞/计算机英语词频统计 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
'''
Main module for word-frequency analysis.
'''
import asyncio
import collections
import time
from datastore import datastore
from model.models import modelHtmlText, modelWordCount
import statistics
from log import log
import settings
# Module-level logger: the project `log` class, constructed with
# settings.WORDCOUNT_LOG_FILE as its filename and log.INFO as its level.
logging = log(filename=settings.WORDCOUNT_LOG_FILE, level=log.INFO)
class wordCount:
    '''
    Word-frequency worker for records of the htmltext table.

    Each record obtained through ``datastore.top(modelHtmlText)`` has its
    ``text`` split on whitespace, the single letters listed in
    ``IGNORE_WORD`` removed, and the resulting (word, frequency) tuples passed
    to ``datastore.updateWordCount``. Afterwards the statistics callback
    registered in ``STAT_FUNC`` for the record's ``site`` is invoked.
    '''
    # Seconds to wait before polling again when no record is available.
    SLEEP_TIME = 2
    # Seconds the async loop sleeps after each processed record.
    CYCLE_GAP_TIME = 0.1
    # Every single lowercase letter except 'a' and 'i'; these tokens are
    # dropped before counting.
    IGNORE_WORD = [chr(x) for x in range(ord('a'), ord('z') + 1) if chr(x) not in 'ai']

    def __init__(self):
        self.datastore = datastore()
        # Maps a record's ``site`` value to the statistics updater for it.
        self.STAT_FUNC = {'wikipedia' : self.__wikiStat}
        self.logging = logging

    def __wikiStat(self, wordList):
        '''
        Update the wikipedia counters for one processed record.

        Adds 1 to the done-link counter, subtracts 1 from the
        pre-analyze-html counter and adds ``len(wordList)`` to the word
        counter, all through the ``statistics`` module imported by this file.
        '''
        statistics.addwikiDoneLinkCount(1)
        statistics.decwikiPreAnalyzeHtmlCount(1)
        statistics.addwikiWordCount(len(wordList))

    def _processRecord(self, textObj):
        '''
        Count the words of one record and store the result.

        :param textObj: object exposing ``site`` and ``text`` attributes.

        A record whose ``site`` has no entry in ``STAT_FUNC`` still has its
        word counts written; only the statistics update is skipped, with a
        warning, instead of raising ``KeyError`` and stopping the caller's
        loop.
        '''
        site = textObj.site
        text = textObj.text
        textList = [x for x in text.split() if x not in wordCount.IGNORE_WORD]
        # List of (word, frequency) tuples, most frequent first.
        wordTuples = collections.Counter(textList).most_common()
        self.datastore.updateWordCount(wordTuples)
        statFunc = self.STAT_FUNC.get(site)
        if statFunc is None:
            self.logging.warning("wordCount no stat handler for site={}".format(site))
            return
        statFunc(textList)

    async def count(self):
        '''
        Count word frequencies for every htmltext record and write them to
        the wordcount table (asynchronous entry point).

        Sleeps ``SLEEP_TIME`` seconds when no record is available and
        ``CYCLE_GAP_TIME`` seconds after each processed record. Any
        ``Exception`` raised inside the loop is logged and ends the loop.
        '''
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    await asyncio.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
                await asyncio.sleep(wordCount.CYCLE_GAP_TIME)
        except Exception as e:
            self.logging.error("wordCount count exception={}".format(e))

    def syncCount(self):
        '''
        Count word frequencies for every htmltext record and write them to
        the wordcount table (synchronous entry point).

        Logs a warning and sleeps ``SLEEP_TIME`` seconds when no record is
        available. Any ``Exception`` raised inside the loop is logged and
        ends the loop.
        '''
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    self.logging.warning('no text')
                    time.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
        except Exception as e:
            self.logging.error("wordCount sync count exception={}".format(e))
def main():
    '''
    Script entry point: build a wordCount worker and drive its async
    ``count`` loop with ``asyncio.run``. A KeyboardInterrupt (Ctrl-C) ends
    the run quietly.
    '''
    try:
        worker = wordCount()
        asyncio.run(worker.count())
    except KeyboardInterrupt:
        # Interrupt from the user is the expected way to stop; nothing to do.
        return


# Start the worker only when this file is executed directly.
if __name__ == '__main__':
    main()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。