#! /usr/bin/env python
# -*- coding: utf-8 -*-
#======================================================================
#
# linguist.py -
#
# Created by skywind on 2017/03/22
# Last change: 2017/03/22 13:44:42
#
#======================================================================
import sys, os, time
# https://www.nodebox.net/code/index.php/Linguistics
#----------------------------------------------------------------------
# python 2/3 compatible
#----------------------------------------------------------------------
if sys.version_info[0] >= 3:
long = int
xrange = range
unicode = str
#----------------------------------------------------------------------
# word inflection helpers
#----------------------------------------------------------------------
class WordHelper (object):
	# initialization
def __init__ (self):
self.__lemmatizer = None
	# get the WordNet definitions of a word
def definition (self, word, txt = False):
from nltk.corpus import wordnet as wn
syns = wn.synsets(word)
output = []
for syn in syns:
name = syn.name()
part = name.split('.')
mode = part[1]
output.append((mode, syn.definition()))
if txt:
output = '\n'.join([ (m + ' ' + n) for m, n in output ])
return output
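	# Usage sketch (assumes the NLTK WordNet corpus has been downloaded,
	# e.g. via nltk.download('wordnet'); outputs are illustrative):
	#
	#   tools.definition('bank')            # [('n', 'sloping land ...'), ...]
	#   tools.definition('bank', txt=True)  # newline-joined "pos definition" lines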
	# get the verb forms: -ing, -ed, -en, -s
	# NodeBox's Linguistics package can only handle 6722 of its 11487 verbs
def verb_tenses (self, word):
word = word.lower()
if ' ' in word:
return None
import en
if not en.is_verb(word):
return None
tenses = {}
try:
tenses['i'] = en.verb.present_participle(word)
tenses['p'] = en.verb.past(word)
tenses['d'] = en.verb.past_participle(word)
tenses['3'] = en.verb.present(word, person = 3, negate = False)
except:
return None
valid = True
for k in tenses:
v = tenses[k]
if not v:
valid = False
break
elif "'" in v:
valid = False
break
if not valid:
return None
return tenses
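	# Usage sketch (requires the NodeBox Linguistics "en" module on sys.path;
	# keys: i = -ing form, p = past, d = past participle, 3 = third person singular):
	#
	#   tools.verb_tenses('give')
	#   # -> {'i': 'giving', 'p': 'gave', 'd': 'given', '3': 'gives'}  (illustrative)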
	# noun plural: uncountable nouns sometimes get an -s tacked on as well,
	# so check countability first (against a corpus)
def noun_plural (self, word, method = 0):
plural = None
if method == 0:
import en
plural = en.noun.plural(word)
elif method == 1:
import pattern.en
plural = pattern.en.pluralize(word)
elif method == 2:
import inflect
			plural = inflect.engine().plural(word)
elif method < 0:
plural = self.noun_plural(word, 0)
if not plural:
plural = self.noun_plural(word, 1)
if not plural:
try:
import inflect
					plural = inflect.engine().plural(word)
except:
pass
if not plural:
return None
return plural
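	# Usage sketch: method selects the backing library (0 = NodeBox en,
	# 1 = pattern.en, 2 = inflect); a negative method tries them in turn.
	#
	#   tools.noun_plural('child')               # -> 'children' (illustrative)
	#   tools.noun_plural('analysis', method=1)  # via pattern.en.pluralize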
	# comparative form of an adjective
def adjective_comparative (self, word):
import pattern.en
return pattern.en.comparative(word)
	# superlative form of an adjective
def adjective_superlative (self, word):
import pattern.en
return pattern.en.superlative(word)
	# pluralize a word, using the pattern.en package
def pluralize (self, word):
import pattern.en
return pattern.en.pluralize(word)
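	# Usage sketch (all three wrap pattern.en; exact results depend on its
	# rule set, the values below are illustrative):
	#
	#   tools.adjective_comparative('small')  # -> 'smaller'
	#   tools.adjective_superlative('small')  # -> 'smallest'
	#   tools.pluralize('child')              # -> 'children'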
	# get all verbs
def all_verbs (self):
import en
words = []
for n in en.wordnet.all_verbs():
words.append(n.form)
return words
	# get all adverbs
def all_adverbs (self):
import en
words = []
for n in en.wordnet.all_adverbs():
words.append(n.form)
return words
	# get all adjectives
def all_adjectives (self):
import en
words = []
for n in en.wordnet.all_adjectives():
words.append(n.form)
return words
	# get all nouns
def all_nouns (self):
import en
words = []
for n in en.wordnet.all_nouns():
words.append(n.form)
return words
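	# Usage sketch: the four all_* methods iterate NodeBox's bundled WordNet
	# index and return plain word lists, e.g.
	#
	#   verbs = tools.all_verbs()   # list of verb lemmas as str
	#   print(len(verbs))           # size depends on the bundled WordNet data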
	# return the base (lemma) form of a word
def lemmatize (self, word, pos = 'n'):
word = word.lower()
if self.__lemmatizer is None:
from nltk.stem.wordnet import WordNetLemmatizer
self.__lemmatizer = WordNetLemmatizer()
return self.__lemmatizer.lemmatize(word, pos)
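	# Usage sketch (NLTK WordNetLemmatizer; pos is a WordNet tag:
	# 'n' noun, 'v' verb, 'a' adjective, 'r' adverb):
	#
	#   tools.lemmatize('gave', 'v')    # -> 'give'
	#   tools.lemmatize('leaves', 'n')  # -> 'leaf'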
#----------------------------------------------------------------------
# global
#----------------------------------------------------------------------
tools = WordHelper()
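#----------------------------------------------------------------------
# usage sketch (defined but never called): how the WordHelper methods
# above combine; requires NLTK's WordNet corpus plus the optional
# NodeBox "en" package, so calling it is left to the reader.
#----------------------------------------------------------------------
def _demo_wordhelper():
	# lemmatize an inflected verb back to its base form
	print(tools.lemmatize('bound', 'v'))
	# expand a base verb into its -ing / past / past participle / -s forms
	print(tools.verb_tenses('take'))
	# pluralize a noun with the default (NodeBox en) backend
	print(tools.noun_plural('analysis'))
	return 0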
#----------------------------------------------------------------------
# WordRoot
#----------------------------------------------------------------------
class WordRoot (object):
def __init__ (self, root):
self.root = root
self.count = 0
self.words = {}
def add (self, c5, word, n = 1):
if c5 and word:
term = (c5, word)
if not term in self.words:
self.words[term] = n
else:
self.words[term] += n
self.count += n
return True
def dump (self):
output = []
for term in self.words:
c5, word = term
output.append((c5, word, self.words[term]))
output.sort(key = lambda x: (x[2], x[0]), reverse = True)
return output
def __len__ (self):
return len(self.words)
def __getitem__ (self, key):
return self.words[key]
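#----------------------------------------------------------------------
# usage sketch (defined but never called): how WordRoot accumulates
# tagged word forms; the C5 tags and counts below are made up for
# illustration.
#----------------------------------------------------------------------
def _demo_wordroot():
	stem = WordRoot('take')
	stem.add('VVD', 'took', 120)     # past-tense form with its frequency
	stem.add('VVG', 'taking', 80)    # -ing form
	stem.add('VVD', 'took', 10)      # repeated terms accumulate
	# dump() returns (c5, word, count) tuples sorted by count (then tag), descending
	for c5, word, count in stem.dump():
		print('%s %s %d'%(c5, word, count))
	return 0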
#----------------------------------------------------------------------
# testing
#----------------------------------------------------------------------
if __name__ == '__main__':
def test1():
for word in ['was', 'gave', 'be', 'bound']:
print('%s -> %s'%(word, tools.lemmatize(word, 'v')))
return 0
def test2():
import ascmini
rows = ascmini.csv_load('bnc-words.csv')
output = []
words = {}
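		# assumed bnc-words.csv layout, inferred from how the fields are used
		# below: root word, aggregate count for the root, CLAWS C5 tag,
		# inflected form, count of that form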
for row in rows:
root = row[0]
size = int(row[1])
c5 = row[2]
word = row[3]
count = int(row[4])
head = root[:1].lower()
if size <= 1:
continue
if count * 1000 / size < 1:
continue
if '*' in word:
continue
if c5 in ('UNC', 'CRD'):
continue
if '(' in root or '/' in root:
continue
if head != '\'' and (not head.isalpha()):
if head.isdigit():
continue
if head in ('$', '#', '-'):
continue
if root.count('\'') >= 2:
continue
if not root in words:
stem = WordRoot(root)
words[root] = stem
else:
stem = words[root]
stem.add(c5, word.lower(), count)
for key in words:
stem = words[key]
for c5, word, count in stem.dump():
output.append((stem.root, stem.count, c5, word, count))
output.sort(key = lambda x: (x[1], x[0]), reverse = True)
# ascmini.csv_save(output, 'bnc-clear.csv')
		print('count %d'%len(words))
def test3():
import ascmini
rows = ascmini.csv_load('bnc-clear.csv')
output = []
words = {}
for row in rows:
root = row[0]
size = int(row[1])
c5 = row[2]
word = row[3].lower()
count = int(row[4])
if word == root:
continue
if not root in words:
stem = WordRoot(root)
words[root] = stem
else:
stem = words[root]
stem.add('*', word, count)
stem.count = size
fp = open('bnc-lemma.txt', 'w')
lemmas = []
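		# each line written to bnc-lemma.txt has the form (see the formatting
		# below): root/total_count -> form/count,form/count,...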
for key in words:
stem = words[key]
part = []
for c5, word, count in stem.dump():
output.append((stem.root, stem.count, c5, word, count))
part.append('%s/%d'%(word, count))
if not part:
continue
text = '%s/%d -> '%(stem.root, stem.count)
lemmas.append((stem.count, stem.root, text + ','.join(part)))
output.sort(key = lambda x: (x[1], x[0]), reverse = True)
lemmas.sort(reverse = True)
for _, _, text in lemmas:
fp.write(text + '\n')
ascmini.csv_save(output, 'bnc-test.csv')
		print('count %d'%len(words))
return 0
def test4():
import stardict
lm1 = stardict.LemmaDB()
lm2 = stardict.LemmaDB()
lm1.load('bnc-lemma.txt')
lm2.load('lemma.en.txt')
count1 = 0
count2 = 0
for stem in lm2.dump('stem'):
childs = lm2.get(stem)
stem = stem.lower()
if len(stem) <= 2 and stem.isupper():
continue
if not stem in lm1:
count1 += 1
else:
obj = lm1.get(stem)
for word in childs:
word = word.lower()
if not word in obj:
						print('%s -> %s'%(stem, word))
count2 += 1
for word in childs:
lm1.add(stem, word.lower())
		print('count %d %d'%(count1, count2))
lm1.save('lemma-bnc.txt')
return 0
test4()