#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:roger
# datetime:19-11-12 下午2:52
# software: PyCharm
import codecs
import math
import random
import re
import jieba
import numpy as np
def create_dico(item_list):
    """
    Count how often each item occurs across a list of item sequences.

    :param item_list: a ``list`` whose elements are iterables of hashable items
    :return: dict mapping every item to its total number of occurrences,
        keyed in order of first appearance
    """
    assert type(item_list) is list
    counts = {}
    for sequence in item_list:
        for element in sequence:
            # Missing keys start at zero, so first sightings end up at 1.
            counts[element] = counts.get(element, 0) + 1
    return counts
def create_mapping(dico):
    """
    Build the two lookup tables (item -> ID and ID -> item) for a frequency dict.

    Items are ordered by decreasing frequency; ties are broken by the
    ascending order of the items themselves. IDs start at 0.

    :param dico: dict mapping item to its count
    :return: tuple ``(item_to_id, id_to_item)``
    """
    ranked = sorted(dico.items(), key=lambda pair: (-pair[1], pair[0]))
    id_to_item = {}
    item_to_id = {}
    for index, (item, _count) in enumerate(ranked):
        id_to_item[index] = item
        item_to_id[item] = index
    return item_to_id, id_to_item
def zero_digits(s):
    """
    Replace every digit in a string by zero.

    :param s: input string
    :return: copy of ``s`` in which each character matched by ``\\d`` is "0"
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r"\d", "0", s)
def iob2(tags):
    """
    Validate a tag sequence as IOB and upgrade IOB1 tags to IOB2 in place.

    An "I-x" tag that opens an entity (first position, follows "O", or
    follows a tag with a different suffix) is rewritten to "B-x".

    :param tags: mutable list of tag strings; modified in place
    :return: False as soon as a malformed tag is met (tags before it may
        already have been rewritten), True otherwise
    """
    for position, tag in enumerate(tags):
        if tag == "O":
            continue
        parts = tag.split("-")
        if len(parts) != 2 or parts[0] not in ["I", "B"]:
            return False
        if parts[0] == "B":
            continue
        # From here on the tag is "I-x": decide whether it really continues
        # the previous entity or has to open a new one.
        opens_entity = (
            position == 0
            or tags[position - 1] == "O"
            or tags[position - 1][1:] != tag[1:]
        )
        if opens_entity:
            tags[position] = "B" + tag[1:]
    return True
def iob_iobes(tags):
    """
    Convert a tag sequence from IOB to IOBES.

    A "B-x" that is not followed by an "I-" tag becomes "S-x"; an "I-x"
    that is not followed by an "I-" tag becomes "E-x". "O" is kept as is.

    :param tags: sequence of IOB tag strings
    :return: new list with the IOBES tags
    :raises Exception: if a tag is neither "O" nor prefixed with "B" or "I"
    """
    new_tags = []
    last_index = len(tags) - 1
    for i, tag in enumerate(tags):
        if tag == "O":
            new_tags.append(tag)
            continue
        prefix = tag.split("-")[0]
        # Compare the *prefix* of the next tag, not the whole split list:
        # a list never equals the string "I", which used to turn every B
        # into S and every I into E.
        entity_continues = i < last_index and tags[i + 1].split("-")[0] == "I"
        if prefix == "B":
            new_tags.append(tag if entity_continues else tag.replace("B-", "S-"))
        elif prefix == "I":
            new_tags.append(tag if entity_continues else tag.replace("I-", "E-"))
        else:
            raise Exception("Invalid IOB format")
    return new_tags
def iobes_iob(tags):
    """
    Convert a tag sequence from IOBES back to IOB.

    "S-" is rewritten to "B-" and "E-" to "I-"; tags whose prefix is
    "B", "I" or "O" are copied unchanged.

    :param tags: sequence of IOBES tag strings
    :return: new list with the IOB tags
    :raises Exception: if a tag has any other prefix
    """
    # Prefix -> (old, new) substitution, or None when the tag is kept as is.
    rewrites = {
        'B': None,
        'I': None,
        'O': None,
        'S': ('S-', 'B-'),
        'E': ('E-', 'I-'),
    }
    converted = []
    for tag in tags:
        prefix = tag.split('-')[0]
        if prefix not in rewrites:
            raise Exception('Invalid format!')
        substitution = rewrites[prefix]
        if substitution is None:
            converted.append(tag)
        else:
            converted.append(tag.replace(*substitution))
    return converted
def insert_singletons(words, singletons, p=0.5):
    """
    Randomly swap singleton words for the unknown-word ID 0.

    For each word found in ``singletons`` one sample is drawn from
    ``np.random.uniform()``; the word is replaced by 0 when the sample is
    below ``p``. Other words are copied unchanged and draw no sample.

    :param words: iterable of words (IDs)
    :param singletons: container supporting ``in`` with the singleton words
    :param p: replacement probability
    :return: new list of words
    """
    return [
        0 if (token in singletons and np.random.uniform() < p) else token
        for token in words
    ]
def get_seg_features(string):
    """
    Produce one segmentation feature per character using jieba.

    The text is cut into words with ``jieba.cut``; every character gets a
    code according to its position in its word:
    0 = single-character word, 1 = first character, 2 = inner character,
    3 = last character.

    :param string: text to segment
    :return: flat list of integer codes, in word order
    """
    features = []
    for token in jieba.cut(string):
        size = len(token)
        if size == 1:
            features.append(0)  # single-character word
            continue
        codes = [2] * size  # inner characters by default
        codes[0] = 1  # word start
        codes[-1] = 3  # word end
        features.extend(codes)
    return features
def create_input(data):
    """
    Pick the model inputs out of a sentence record.

    :param data: mapping with the keys 'chars', 'segs' and 'tags'
    :return: list ``[chars, segs, tags]`` in that order
    """
    return [data['chars'], data["segs"], data['tags']]
def load_word2vec(emb_path, id_to_word, word_dim, old_weights):
    """
    Fill an embedding matrix with vectors read from a pre-trained text file.

    Every line of the file is split on whitespace; lines with exactly
    ``word_dim + 1`` fields are read as ``word v1 ... vN``, all other lines
    are counted as invalid. Each word of ``id_to_word`` is then looked up
    as-is, lowercased, and lowercased with digits replaced by "0", in that
    order; the first hit overwrites row ``i`` of the weight matrix.

    :param emb_path: path of the UTF-8 encoded embedding file
    :param id_to_word: mapping from the IDs ``0 .. n-1`` to words
    :param word_dim: expected embedding size
    :param old_weights: weight matrix indexable by word ID; it is updated
        IN PLACE and also returned
    :return: the same object as ``old_weights`` with matched rows replaced
    """
    new_weights = old_weights
    print('Loading pretrained embeddings from {}...'.format(emb_path))
    pre_trained = {}
    emb_invalid = 0
    # Context manager so the file handle is closed even if parsing fails.
    with codecs.open(emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.rstrip().split()
            if len(fields) == word_dim + 1:
                pre_trained[fields[0]] = np.array(
                    [float(x) for x in fields[1:]]
                ).astype(np.float32)
            else:
                emb_invalid += 1
    if emb_invalid > 0:
        print('WARNING: %i invalid lines' % emb_invalid)
    c_found = 0
    c_lower = 0
    c_zeros = 0
    n_words = len(id_to_word)
    # Lookup table initialization
    for i in range(n_words):
        word = id_to_word[i]
        lowered = word.lower()
        if word in pre_trained:
            new_weights[i] = pre_trained[word]
            c_found += 1
        elif lowered in pre_trained:
            new_weights[i] = pre_trained[lowered]
            c_lower += 1
        else:
            # Last resort: lowercase and replace every digit by zero.
            zeroed = re.sub(r'\d', '0', lowered)
            if zeroed in pre_trained:
                new_weights[i] = pre_trained[zeroed]
                c_zeros += 1
    n_initialized = c_found + c_lower + c_zeros
    # Guard the percentage against an empty vocabulary.
    coverage = 100. * n_initialized / n_words if n_words else 0.
    print('Loaded %i pretrained embeddings.' % len(pre_trained))
    print('%i / %i (%.4f%%) words have been initialized with '
          'pretrained embeddings.' % (
              n_initialized, n_words, coverage)
          )
    print('%i found directly, %i after lowercasing, '
          '%i after lowercasing + zero.' % (
              c_found, c_lower, c_zeros
          ))
    return new_weights
def full_to_half(s):
    """
    Convert full-width characters to their half-width counterparts.

    The ideographic space U+3000 becomes an ASCII space and the code points
    U+FF01..U+FF5E are shifted down by 0xFEE0; every other character is
    left unchanged.

    :param s: iterable of single characters (typically a string)
    :return: converted string
    """
    def narrow(ch):
        code = ord(ch)
        if 0xFF01 <= code <= 0xFF5E:
            code -= 0xFEE0
        elif code == 0x3000:
            code = 0x20
        return chr(code)

    return ''.join(narrow(ch) for ch in s)
def cut_to_sentence(text):
    """
    Cut text into sentences.

    A sentence ends after one of the terminators "。;!?" or a newline.
    When the terminator is directly followed by a closing quote or another
    terminator, the cut is postponed by one character so that the trailing
    mark stays attached to the sentence.

    :param text: text to split
    :return: list of sentence strings; concatenated they give ``text`` back
    """
    sentence = []
    sentences = []
    len_p = len(text)
    pre_cut = False  # a cut was postponed on the previous character
    for idx, word in enumerate(text):
        sentence.append(word)
        cut = False
        if pre_cut:
            cut = True
            pre_cut = False
        if word in u"。;!?\n":
            cut = True
            if len_p > idx + 1:
                if text[idx + 1] in ".。”\"\'“”‘’?!":
                    # Keep the following mark with this sentence.
                    cut = False
                    pre_cut = True
        if cut:
            # Join here as well: previously these sentences were appended as
            # lists of characters while only the trailing one was a string.
            sentences.append("".join(sentence))
            sentence = []
    if sentence:
        sentences.append("".join(sentence))
    return sentences
def replace_html(s):
    """
    Decode a fixed set of HTML entities in a string.

    ``&quot; &amp; &lt; &gt; &nbsp; &ldquo; &rdquo;`` are replaced by the
    characters they stand for, ``&mdash;`` is removed, and the non-breaking
    space U+00A0 becomes a normal space.

    The search strings must be the entity names: with the decoded character
    on both sides each replace is a no-op.

    :param s: input string
    :return: string with the entities replaced
    """
    # Applied in order. "&amp;" is decoded before "&lt;"/"&gt;", so a doubly
    # escaped "&amp;lt;" ends up as "<".
    replacements = (
        ('&quot;', '"'),
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ' '),
        ("&ldquo;", "“"),
        ("&rdquo;", "”"),
        ("&mdash;", ""),
        ("\xa0", " "),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    return s
class BatchManager(object):
    """
    Split samples into length-sorted batches and serve them one by one.

    ``batch_data`` holds the arranged batches, ``len_data`` their number.
    """

    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        """
        Sort the samples by the length of their first field and chunk them.

        :param data: sequence of samples
        :param batch_size: number of samples per batch
        :return: list of batches, each produced by ``arrange_batch``
        """
        batch_count = int(math.ceil(len(data) / batch_size))
        by_length = sorted(data, key=lambda sample: len(sample[0]))
        return [
            self.arrange_batch(
                by_length[int(k * batch_size): int((k + 1) * batch_size)]
            )
            for k in range(batch_count)
        ]

    @staticmethod
    def arrange_batch(batch):
        '''
        Arrange a batch into an array of shape [5, ].

        :param batch: iterable of samples, each with exactly five fields
        :return: list of five lists; the k-th list collects the k-th field
            of every sample
        '''
        columns = [[], [], [], [], []]
        for sample in batch:
            # Unpacking enforces that every sample has exactly five fields.
            string, seg_ids, char, msk, target = sample
            for column, field in zip(columns, (string, seg_ids, char, msk, target)):
                column.append(field)
        return columns

    @staticmethod
    def pad_data(data):
        """
        Right-pad fields of every sample with zeros to a common length.

        The target length is the longest first field in ``data``. The first,
        third, fourth and fifth fields are padded; the second field is
        unpacked but not used.

        :param data: non-empty sequence of five-field samples whose fields
            are lists
        :return: list ``[strings, chars, segs, targets]``
        """
        longest = max([len(sample[0]) for sample in data])
        strings, chars, segs, targets = [], [], [], []
        for sample in data:
            string, segment_ids, char, seg, target = sample
            filler = [0] * (longest - len(string))
            for bucket, field in (
                (strings, string),
                (chars, char),
                (segs, seg),
                (targets, target),
            ):
                bucket.append(field + filler)
        return [strings, chars, segs, targets]

    def iter_batch(self, shuffle=False):
        """
        Yield the stored batches in order.

        :param shuffle: when true, shuffle ``batch_data`` in place first
        """
        if shuffle:
            random.shuffle(self.batch_data)
        for position in range(self.len_data):
            yield self.batch_data[position]
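# [Hosting-page notice captured together with the file; not part of the module.]
# Content here may be unsuitable for display and is therefore not shown by the page. You can review and modify it using the relevant editing functions.
# If you confirm that the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, and nothing that violates national laws and regulations, you may click submit to appeal and it will be handled as soon as possible.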