1 Star 0 Fork 1

天外飞碟/CyberSecurity_Knowledge_graph

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
ngram_learn.py 2.96 KB
一键复制 编辑 原始数据 按行查看 历史
HoloLen 提交于 2020-04-07 13:18 . the easy demo
# coding:utf-8
"""
Python release: 3.2.3
"""
import math
class LM:
    """N-gram language model with additive (add-lambda) smoothing.

    The training file is read line by line; each line is one sentence whose
    tokens are separated by whitespace.  Tokens that are exactly one of the
    marks in ``self.punctuation`` are dropped, and every sentence is wrapped
    in the markers ``<s>`` and ``</s>``.

    Attributes:
        punctuation: set of tokens discarded during tokenisation.
        step: n-gram order; each word is conditioned on at most
            ``step - 1`` preceding tokens.
        lam: additive smoothing constant.
        words: list of training sentences (token lists incl. markers).
        freq: ``{context: {word: count}}`` filled by :meth:`train`; a
            context is its tokens joined with ``','``.
        size: number of distinct training tokens (markers excluded), used
            as the vocabulary size in the smoothing denominator.
    """

    def __init__(self, train_path, step=2, lam=1, encoding='utf-8'):
        """Read and tokenise the training corpus (no counting yet).

        Args:
            train_path: path of the whitespace-tokenised training file.
            step: n-gram order (2 = bigram). Must be >= 1.
            lam: additive smoothing constant.
            encoding: text encoding of ``train_path``.  Given explicitly so
                that reading does not depend on the platform's locale.

        Raises:
            ValueError: if ``step`` is smaller than 1.
        """
        if step < 1:
            raise ValueError('step must be >= 1, got %r' % (step,))
        # punctuations need to be removed
        # NOTE(review): several marks here are the ASCII forms; if the corpus
        # uses full-width punctuation they may need adding -- confirm.
        self.punctuation = {",", "、", "。", "?", "“", "”", "!", ";", "‘", "’", "《", "》", "%"}
        self.step = step
        self.lam = lam
        self.words = []
        self.freq = {}
        unique = set()
        # read the train data and remove the punctuation
        with open(train_path, encoding=encoding) as f:
            for line in f:
                tokens = self._tokenize(line)
                # gather the unique words (markers are not counted)
                unique.update(tokens)
                # add the start/end symbol
                self.words.append(['<s>'] + tokens + ['</s>'])
        self.size = len(unique)

    def _tokenize(self, text):
        """Split ``text`` on whitespace and drop punctuation tokens."""
        return [tok for tok in text.strip().split()
                if tok not in self.punctuation]

    def _contextWordPairs(self, sentence):
        """Yield ``(context, word)`` for every predicted word of ``sentence``.

        ``sentence`` is a token list that already carries ``<s>``/``</s>``.
        The first and last positions are never yielded as the predicted
        word, so ``</s>`` is not scored (this matches ``self.size``, which
        excludes the markers).  ``context`` is the up-to ``step - 1``
        preceding tokens joined with ``','`` (empty string for step 1).
        """
        for end in range(1, len(sentence) - 1):
            # Window of at most ``step`` tokens ending at ``end``; near the
            # sentence start it is simply shorter.
            start = max(0, end - self.step + 1)
            yield ','.join(sentence[start:end]), sentence[end]

    def getNgram(self, sentence):
        """Yield each n-gram of ``sentence`` as the string ``"context|word"``.

        The context tokens are comma-separated, e.g. ``"a,b|c"``.
        """
        for context, word in self._contextWordPairs(sentence):
            yield context + '|' + word

    def train(self):
        """Accumulate n-gram counts of ``self.words`` into ``self.freq``."""
        for sentence in self.words:
            for context, word in self._contextWordPairs(sentence):
                counts = self.freq.setdefault(context, {})
                counts[word] = counts.get(word, 0) + 1

    def getProb(self, word, condition):
        """Return the smoothed probability of ``word`` given ``condition``.

        Computes ``(count(condition, word) + lam) /
        (count(condition) + size * lam)``; an unseen ``condition``
        contributes zero counts.

        Raises:
            ZeroDivisionError: if the denominator is zero (e.g. ``lam == 0``
                with an unseen condition).
        """
        counts = self.freq.get(condition, {})
        # smooth
        w_num = counts.get(word, 0) + self.lam
        cond_num = sum(counts.values()) + self.size * self.lam
        return float(w_num / cond_num)

    def sentenceProb(self, sentence):
        """Return the base-2 log-probability of a whitespace-separated sentence.

        The sentence is tokenised exactly like the training data
        (punctuation tokens dropped, ``<s>``/``</s>`` added).  Returns ``0``
        when there is no word to score.

        Raises:
            ValueError: from ``math.log`` if some probability is 0
                (possible only when ``lam == 0``).
        """
        mysentence = ['<s>'] + self._tokenize(sentence) + ['</s>']
        result = 0
        for context, word in self._contextWordPairs(mysentence):
            result += math.log(self.getProb(word, context), 2)
        return result
if __name__ == '__main__':
    # Demo: fit a bigram model on the bundled corpus, then print the
    # log2-probability of four orderings of the same words.
    inpath = './Data/train.in'
    print('Reading data...')
    lm = LM(inpath, 2)
    print('Start training...')
    lm.train()
    candidates = (
        '我 在 北京 天安门',
        '在 我 北京 天安门',
        '北京 我 在 天安门',
        '北京 在 我 天安门',
    )
    for candidate in candidates:
        print(candidate)
        print(lm.sentenceProb(candidate))
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/tianwaifeidie/CyberSecurity_Knowledge_graph.git
git@gitee.com:tianwaifeidie/CyberSecurity_Knowledge_graph.git
tianwaifeidie
CyberSecurity_Knowledge_graph
CyberSecurity_Knowledge_graph
master

搜索帮助