1 Star 0 Fork 0

Zreal/Collective Intelligence

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
generatefeedvector.py 2.04 KB
一键复制 编辑 原始数据 按行查看 历史
Zreal 提交于 2018-09-20 21:55 . 2018.9.20
# -*-coding:utf-8-*-
# @Time : 18-9-18下午6:38
# @Author : Zreal
# @File :generatefeedvector.py
# @Email :Zenglz_pro@163.com
import feedparser
import re
def getwordcounts(url):
    """Parse the feed at ``url`` and tally the words used in its entries.

    Returns a ``(title, counts)`` tuple. ``title`` is the feed's title, or
    ``'Unknown title'`` when the parsed feed has no ``title`` attribute.
    ``counts`` maps each lower-cased word to the number of times it occurs
    in the titles and summaries/descriptions of all entries.

    Any exception raised while parsing or counting is swallowed and the
    sentinel pair ``("None", -1)`` is returned instead.
    """
    try:
        parsed = feedparser.parse(url)
        counts = {}
        for entry in parsed.entries:
            # Prefer the summary; fall back to the description otherwise.
            text = entry.summary if 'summary' in entry else entry.description
            for token in getwords(entry.title + ' ' + text):
                counts[token] = counts.get(token, 0) + 1
        return getattr(parsed.feed, 'title', 'Unknown title'), counts
    except Exception:
        return "None", -1
def getwords(html):
    """Return the lower-cased alphabetic words found in ``html``.

    HTML tags are removed first, then the remaining text is split on every
    run of characters that are not ASCII letters. Empty fragments produced
    by the split are discarded.
    """
    # Strip HTML tags.
    txt = re.sub(r'<[^>]+>', '', html)
    # Split into words. The class must not contain a second '^': inside a
    # character class a non-leading '^' is a literal caret, which would make
    # '^' count as part of a word.
    words = re.split(r'[^A-Za-z]+', txt)
    return [word.lower() for word in words if word]
if __name__ == "__main__":
    # Build a blog-by-word count matrix from the feeds listed in
    # feedlist.txt and write it to blogdata.txt as tab-separated text.

    # One feed URL per line; drop the trailing newline and skip blank lines
    # so they are neither fetched nor counted as feeds.
    with open('feedlist.txt') as feedfile:
        feedlist = [line.strip() for line in feedfile if line.strip()]

    # apcount: for each word, the number of blogs in which it occurs more
    # than once. wordcounts: per-blog word counts keyed by blog title.
    apcount = {}
    wordcounts = {}
    for feedurl in feedlist:
        print(feedurl)
        title, wc = getwordcounts(feedurl)
        print(title)
        print(wc)
        # getwordcounts signals failure with wc == -1; skip such feeds.
        if wc != -1:
            wordcounts[title] = wc
            for word, count in wc.items():
                apcount.setdefault(word, 0)
                if count > 1:
                    apcount[word] += 1
        print("finished !!!\n\n")

    # Pick representative words. A word like "the" may appear in nearly
    # every blog, while rare terms appear in too few to characterise a
    # feed, so keep only words whose blog fraction lies between a lower
    # and an upper bound.
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        if 0.1 < frac < 0.5:
            wordlist.append(w)

    with open('blogdata.txt', 'w') as out:
        # Header row: "Blog" followed by one column per selected word.
        out.write('Blog')
        for word in wordlist:
            out.write('\t%s' % word)
        out.write('\n')
        # One row per blog: title followed by the count of each word.
        for blog, wc in wordcounts.items():
            out.write(blog)
            for word in wordlist:
                if word in wc:
                    out.write('\t%d' % wc[word])
                else:
                    out.write('\t0')
            out.write('\n')
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/Zrea1/Collective-Intelligence.git
git@gitee.com:Zrea1/Collective-Intelligence.git
Zrea1
Collective-Intelligence
Collective Intelligence
master

搜索帮助