# [Web-viewer residue, not part of the program] Code pull complete; the page will refresh automatically.
import pandas as pd
import numpy as np
import re
import jieba.posseg as pseg
import codecs
from langconv import Converter
import os
# Module-level setup: locate the LTP model files, load the dependency parser
# once at import time, and compile the character filters used by the text
# clean-up helpers below.
LTP_DATA_DIR = './ltp_data_v3.4.0/ltp_data_v3.4.0' # path to the LTP model directory
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') # path to the dependency-parsing model; the model file is named `parser.model`
from pyltp import Parser
parser = Parser() # create the parser instance
parser.load(par_model_path) # load the model
# Matches every character that is NOT in U+4E00..U+9FA5, the letter 'A', or ','.
# Only referenced from a commented-out line in `input1`.
filtrate = re.compile(u'[^\u4E00-\u9FA5A,]')
# Matches every character that is NOT in U+4E00..U+9FA5, ',' or ':'.
# Used by `dealText` to strip everything else.
filtrate2 = re.compile(u'[^\u4E00-\u9FA5,:]')
# Convert Traditional Chinese to Simplified Chinese
def cht_to_chs(line):
    """Convert Traditional Chinese text to Simplified Chinese.

    Args:
        line: The text to convert.

    Returns:
        The value returned by ``Converter('zh-hans').convert(line)``.
    """
    # The previous version also evaluated ``line.encode('utf-8')`` and threw
    # the result away; strings are immutable, so that statement had no effect
    # on the returned value and has been dropped.
    return Converter('zh-hans').convert(line)
def input1(word):
    """Split raw text into its non-empty comma-separated segments.

    The text is first passed through ``cht_to_chs``, every ' ' character is
    removed, and the result is split on ','.

    Args:
        word: The raw input text.

    Returns:
        A list of the non-empty pieces, in their original order.
    """
    compact = cht_to_chs(word).replace(' ', '')
    # Disabled in the original (kept as commented-out code there): mapping
    # '~' and '…' to ',', rewriting 'wifi'/'Wifi'/'WIFI' to '无线网', spelling
    # out '1分'..'5分', folding sentence punctuation into ',', and stripping
    # characters with the module-level `filtrate` pattern.
    return [piece for piece in compact.split(',') if piece]
def data_prepare(words):
    """Produce per-character sequences of text, POS tags and parse relations.

    Each entry of ``words`` is segmented with ``pseg.cut``; the resulting
    tokens and their flags are handed to the module-level ``parser``. Every
    token's flag and every arc's relation is then repeated once per character
    of the token it belongs to, so the tag sequences line up with the
    character sequence. The ``head:relation`` pairs of each parse are printed
    tab-separated on one line.

    Args:
        words: Sequence of text segments.

    Returns:
        A tuple ``(dataList, flagList, parserList)``: for each segment, the
        list of its characters, the per-character flag list, and the
        per-character relation list.
    """
    char_rows = []
    flag_rows = []
    relation_rows = []
    for text in words:
        # Character sequence of the raw segment.
        char_rows.append(list(text))

        # Segment the text and keep tokens, their characters and their flags.
        token_chars = []
        tokens = []
        token_flags = []
        for token, token_flag in pseg.cut(text):
            token_chars.append(list(token))
            tokens.append(token)
            token_flags.append(token_flag)

        # Dependency parse over the token / flag sequences.
        arcs = parser.parse(tokens, token_flags)
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        relations = [arc.relation for arc in arcs]

        # Repeat each token's flag once per character of that token.
        flags_per_char = []
        for chars, token_flag in zip(token_chars, token_flags):
            flags_per_char.extend([token_flag] * len(chars))
        flag_rows.append(flags_per_char)

        # Same expansion for the relations; indexed lookup so that a parse
        # returning more arcs than tokens still raises instead of truncating.
        relations_per_char = []
        for position, relation in enumerate(relations):
            relations_per_char.extend([relation] * len(token_chars[position]))
        relation_rows.append(relations_per_char)
    return char_rows, flag_rows, relation_rows
def reDealText(string):
    """Replace the Chinese stand-in phrases with their alphanumeric terms.

    The substitutions are applied one after another, in the order listed, so
    a later rule sees the output of every earlier one.

    Args:
        string: Text containing the Chinese stand-in phrases.

    Returns:
        The text with every listed phrase replaced.
    """
    substitutions = (
        # Car brands
        ('奥迪叉六', 'A6'),
        ('奥迪叉八', 'A8'),
        # Automotive terms
        ('汽售中心', '4S店'),
        ('七档无极变速', '7档CVT'),
        ('车载智能通信系统', 'GBOOK'),
        ('右侧立柱', '右侧B柱'),
        ('手动档', 'MT'),
        ('自动档', 'AT'),
        ('定位系统', 'GPS'),
        ('智能可变气门正时系统', 'VVT-I'),
        ('四档', 'D档'),
        ('六档位', '6MT'),
        ('辅助', 'AUX'),
        ('光盘', 'CD'),
    )
    for phrase, term in substitutions:
        string = string.replace(phrase, term)
    return string
def dealText(string):
    """Normalise raw text into a string of Chinese characters.

    Steps, in order: convert with ``cht_to_chs``; upper-case; rewrite the
    listed punctuation marks; replace slang, car-brand and automotive
    abbreviations with Chinese phrases; finally delete every character matched
    by the module-level ``filtrate2`` pattern.

    The replacements are applied sequentially, so their order matters.

    Args:
        string: The raw input text.

    Returns:
        The normalised text.
    """
    string = cht_to_chs(string)
    string = string.upper()
    # Punctuation (the ASCII colon is not handled here)
    string = string.replace('。', ',').replace('?', ',').replace('!', ',')
    string = string.replace(',', ',').replace('.', ',').replace('?', ',').replace('!', ',').replace('~', ',').replace(
        '…', ',').replace('-', ',')
    # NOTE(review): as written, this removes every ',' produced above, so the
    # later rules whose pattern contains ',' ('G,BOOK', 'VVT,I') cannot match.
    # Confirm the intended comma characters against the original file.
    string = string.replace(',', '')
    # Colloquialisms
    string = string.replace('牛B', '牛逼').replace('NB', '牛逼')
    string = string.replace('TMD', '他妈的')
    string = string.replace('MM', '妹妹')
    string = string.replace('NO1', '第一')
    string = string.replace('OK', '不错')
    string = string.replace('数1数2', '数一数二')
    # Car brands
    string = string.replace('ES', '雷克萨斯').replace('LS', '雷克萨斯').replace('IS300', '雷克萨斯')
    string = string.replace('SANTANA', '桑塔纳')
    string = string.replace('CIVIC', '思域')
    string = string.replace('MDX', '讴歌')
    string = string.replace('X6', '宝马')
    string = string.replace('A6', '奥迪叉六').replace('A8', '奥迪叉八')
    string = string.replace('KK', '斯帕克')
    string = string.replace('G35', '英菲迪尼')
    string = string.replace('BYD', '比亚迪')
    string = string.replace('CRV', '本田')
    string = string.replace('CTS', '凯迪拉克')
    # Automotive terms
    # NOTE(review): the 'SSSS店' rule can never match because 'SSSS' has
    # already been rewritten; the end result is the same either way.
    string = string.replace('SSSS', '4S').replace('SSSS店', '4S').replace('4S店', '4S').replace('4S', '汽售中心')
    string = string.replace('7档CVT', '七档无极变速')
    string = string.replace('G,BOOK', '车载智能通信系统').replace('GBOOK', '车载智能通信系统').replace('BOOK', '车载智能通信系统')
    string = string.replace('右侧B柱', '右侧立柱')
    string = string.replace('AT', '自动档')
    string = string.replace('V6', '发动机')
    string = string.replace('GPS', '定位系统')
    string = string.replace('VVT,I', '智能可变气门正时系统')
    string = string.replace('D档', '四档')
    string = string.replace('6MT', '六档位')
    # 'MT' must run after '6MT': when 'MT' was replaced first, the '6MT' rule
    # could never match. Moving 'MT' here only affects inputs containing
    # '6MT'; none of the rules it was moved past overlap with 'MT'.
    string = string.replace('MT', '手动档')
    string = string.replace('AUX', '辅助')
    string = string.replace('CD', '光盘')
    string = string.replace('CROWN', '皇冠')
    # Filter: drop every character matched by filtrate2
    string = filtrate2.sub(r'', string)
    return string
def write(word, flag,parser):
    """Write the tagged character sequences to ``./data/sample_test.txt``.

    One line is written per character, in the form
    ``<char>\\t<flag>\\t<relation>\\t\\n``, and an empty line is written after
    each sequence. The file is opened in ``'w'`` mode with UTF-8 encoding, so
    any existing content is replaced.

    Args:
        word: List of character lists, one per sequence.
        flag: List of per-character flag lists, parallel to ``word``.
        parser: List of per-character relation lists, parallel to ``word``.

    Raises:
        IndexError: If ``flag`` or ``parser`` is shorter than ``word`` at any
            position.
    """
    # The context manager closes the file even when a row is malformed; the
    # previous explicit close() was skipped whenever the loop raised.
    with codecs.open('./data/sample_test.txt', 'w', 'utf-8') as fw:
        for i, chars in enumerate(word):
            for j, char in enumerate(chars):
                # Indexed lookups (not zip) so length mismatches still raise.
                fw.write(char + '\t' + flag[i][j] + '\t' + parser[i][j] + '\t' + '\n')
            # Blank line separates consecutive sequences.
            fw.write('\n')
def writetxt(string):
    """Segment ``string``, tag it, and write the result to disk.

    The text is split with ``input1``, expanded into per-character sequences
    by ``data_prepare``, and passed to ``write``.

    Args:
        string: The raw input text.

    Returns:
        The list of segments produced by ``input1``.
    """
    # The ``dealText`` normalisation step is disabled (commented out) here:
    # string = dealText(string)
    segments = input1(string)
    char_rows, flag_rows, relation_rows = data_prepare(segments)
    write(char_rows, flag_rows, relation_rows)
    return segments
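# [Web-viewer residue, not part of the program] This section may contain content unsuitable for display and is not shown on the page. You can review and modify it using the editing functions.
# [Web-viewer residue, not part of the program] If you confirm the content does not involve inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or content that violates national laws and regulations, you may click submit to appeal and it will be handled as soon as possible.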