# -*- coding: utf-8 -*-
# @Time   : 2018-09-18 18:38
# @Author : Zreal
# @File   : generatefeedvector.py
# @Email  : Zenglz_pro@163.com
import feedparser
import re


def getwordcounts(url):
    """Return the title of an RSS/Atom feed and a word-count dictionary for its entries."""
    try:
        d = feedparser.parse(url)
        wc = {}
        # Count the words in every entry of the feed
        for e in d.entries:
            if 'summary' in e:
                summary = e.summary
            else:
                summary = e.description
            # Extract a list of words from the title plus the body text
            words = getwords(e.title + ' ' + summary)
            for word in words:
                wc.setdefault(word, 0)
                wc[word] += 1
        return getattr(d.feed, 'title', 'Unknown title'), wc
    except Exception:
        # Parsing failed; signal the error with a sentinel count of -1
        return "None", -1


def getwords(html):
    """Strip HTML tags from a string and split it into lowercase words."""
    # Remove all HTML tags
    txt = re.compile(r'<[^>]+>').sub('', html)
    # Split on any run of non-alphabetic characters
    words = re.compile(r'[^A-Za-z]+').split(txt)
    # Lowercase everything and drop empty strings
    return [word.lower() for word in words if word != '']
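
# A minimal usage sketch (assumed input, not part of the original script):
#   getwords('<p>Hello <b>World</b>, hello!</p>')  ->  ['hello', 'world', 'hello']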

if __name__ == "__main__":
    feedlist = [line.strip() for line in open('feedlist.txt')]
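    # Illustrative sketch of the expected feedlist.txt contents (assumed URLs,
    # not from the original script): one feed URL per line, for example
    #   http://feeds.feedburner.com/example-feed
    #   http://example.org/atom.xml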
    # apcount: the number of blogs each word appears in
    apcount = {}
    # wordcounts: per-blog word counts keyed by blog title
    wordcounts = {}
    for feedurl in feedlist:
        print(feedurl)
        title, wc = getwordcounts(feedurl)
        print(title)
        print(wc)
        if wc != -1:
            wordcounts[title] = wc
            for word, count in wc.items():
                apcount.setdefault(word, 0)
                if count > 1:
                    apcount[word] += 1
    print("finished !!!\n\n")
    # Pick out words that are representative of these feeds: a word like "the"
    # probably appears in every blog, while very rare words show up in too few
    # feeds to characterise a source, so keep only words whose appearance
    # fraction lies between a lower and an upper bound.
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5:
            wordlist.append(w)
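    # Worked example with assumed numbers (not from the original script): with
    # 100 feeds, a word appearing in 8 of them has frac = 0.08 and is dropped as
    # too rare, one appearing in 30 feeds (frac = 0.30) is kept, and one
    # appearing in 60 feeds (frac = 0.60) is dropped as too common.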
    # Write the data matrix: a header row of words, then one row per blog
    out = open('blogdata.txt', 'w')
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            if word in wc:
                out.write('\t%d' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')
    out.close()
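    # Illustrative sketch of the resulting blogdata.txt layout (assumed blog
    # titles and counts, not real output), tab-separated:
    #   Blog            python  cluster data
    #   Some Blog A     3       0       1
    #   Some Blog B     0       2       5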