# -*- coding: utf-8 -*-
# @Time   : 2018-09-18 18:38
# @Author : Zreal
# @File   : generatefeedvector.py
# @Email  : Zenglz_pro@163.com
import feedparser
import re


def getwordcounts(url):
    """Return the title of an RSS/Atom feed and a word-count dictionary for its entries."""
    try:
        d = feedparser.parse(url)
        wc = {}
        # Count the words in every entry of the feed
        for e in d.entries:
            if 'summary' in e:
                summary = e.summary
            else:
                summary = e.description
            # Extract a list of words from the title plus the body text
            words = getwords(e.title + ' ' + summary)
            for word in words:
                wc.setdefault(word, 0)
                wc[word] += 1
        return getattr(d.feed, 'title', 'Unknown title'), wc
    except Exception:
        # Parsing failed; signal the error with a sentinel count of -1
        return "None", -1


def getwords(html):
    """Strip HTML tags from a string and split it into lowercase words."""
    # Remove all HTML tags
    txt = re.compile(r'<[^>]+>').sub('', html)
    # Split on any run of non-alphabetic characters
    words = re.compile(r'[^A-Za-z]+').split(txt)
    # Lowercase everything and drop empty strings
    return [word.lower() for word in words if word != '']
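
# A minimal usage sketch (assumed input, not part of the original script):
#   getwords('<p>Hello <b>World</b>, hello!</p>')  ->  ['hello', 'world', 'hello']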

if __name__ == "__main__":
    feedlist = [line.strip() for line in open('feedlist.txt')]
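    # Illustrative sketch of the expected feedlist.txt contents (assumed URLs,
    # not from the original script): one feed URL per line, for example
    #   http://feeds.feedburner.com/example-feed
    #   http://example.org/atom.xml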
    # apcount: the number of blogs each word appears in
    apcount = {}
    # wordcounts: per-blog word counts keyed by blog title
    wordcounts = {}
    for feedurl in feedlist:
        print(feedurl)
        title, wc = getwordcounts(feedurl)
        print(title)
        print(wc)
        if wc != -1:
            wordcounts[title] = wc
            for word, count in wc.items():
                apcount.setdefault(word, 0)
                if count > 1:
                    apcount[word] += 1
    print("finished !!!\n\n")
    # Pick out words that are representative of these feeds: a word like "the"
    # probably appears in every blog, while very rare words show up in too few
    # feeds to characterise a source, so keep only words whose appearance
    # fraction lies between a lower and an upper bound.
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5:
            wordlist.append(w)
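    # Worked example with assumed numbers (not from the original script): with
    # 100 feeds, a word appearing in 8 of them has frac = 0.08 and is dropped as
    # too rare, one appearing in 30 feeds (frac = 0.30) is kept, and one
    # appearing in 60 feeds (frac = 0.60) is dropped as too common.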
    # Write the data matrix: a header row of words, then one row per blog
    out = open('blogdata.txt', 'w')
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            if word in wc:
                out.write('\t%d' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')
    out.close()
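    # Illustrative sketch of the resulting blogdata.txt layout (assumed blog
    # titles and counts, not real output), tab-separated:
    #   Blog            python  cluster data
    #   Some Blog A     3       0       1
    #   Some Blog B     0       2       5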