tuples
/
deal.py

import pandas as pd
import numpy as np
import re
import jieba.posseg as pseg
import codecs
from langconv import Converter
import os
LTP_DATA_DIR = './ltp_data_v3.4.0/ltp_data_v3.4.0'  # path to the LTP model directory
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # path to the dependency-parsing model; the model file is named `parser.model`

from pyltp import Parser
parser = Parser() # create the parser instance
parser.load(par_model_path)  # load the model (runs at import time)
# Matches every character that is NOT in U+4E00-U+9FA5, the letter 'A', or the
# fullwidth comma. Only referenced from commented-out code in input1.
# NOTE(review): the 'A' right after \u9FA5 looks like a typo -- confirm.
filtrate = re.compile(u'[^\u4E00-\u9FA5A，]')
# Matches every character that is NOT in U+4E00-U+9FA5, the fullwidth comma or
# the fullwidth colon; dealText uses it to strip everything else.
filtrate2 = re.compile(u'[^\u4E00-\u9FA5，：]')


# Convert Traditional Chinese to Simplified Chinese
def cht_to_chs(line):
    """Return *line* converted with ``Converter('zh-hans')``.

    Args:
        line: text to convert.

    Returns:
        Whatever ``Converter('zh-hans').convert`` returns for *line*; no
        encoding step is applied to it.
    """
    # The former ``line.encode('utf-8')`` call discarded its result, so it had
    # no effect on the returned value and has been dropped.
    return Converter('zh-hans').convert(line)


def input1(word):
    """Split a sentence into its non-empty clauses.

    The text is converted with ``cht_to_chs``, ASCII spaces are removed, and
    the result is split on the fullwidth comma.

    Args:
        word: the sentence to split.

    Returns:
        List of clause strings in original order, with empty pieces removed.
    """
    # Further normalisation (unifying punctuation to '，', rewriting wifi and
    # digit scores, filtering through ``filtrate``) used to run here and is
    # currently disabled.
    compact = cht_to_chs(word).replace(' ', '')
    return [clause for clause in compact.split('，') if clause]


def data_prepare(words):
    """Build per-character features for every clause in *words*.

    Each clause is segmented and tagged with ``pseg.cut`` and then parsed with
    the module-level ``parser``. The label of each segmented word is repeated
    once for every character of that word, giving character-aligned sequences.

    Args:
        words: iterable of clause strings.

    Returns:
        A 3-tuple ``(chars, tags, relations)`` of parallel lists with one
        entry per clause: the clause's characters, the per-character tag from
        ``pseg.cut``, and the per-character ``arc.relation`` from the parser.

    Side effects:
        Prints one tab-separated ``head:relation`` line per clause.
    """
    char_rows, tag_rows, relation_rows = [], [], []

    for clause in words:
        char_rows.append(list(clause))

        # Word segmentation with a tag for every word.
        tokens, tags = [], []
        for token, tag in pseg.cut(clause):
            tokens.append(token)
            tags.append(tag)

        arcs = parser.parse(tokens, tags)  # dependency parsing
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        relations = [arc.relation for arc in arcs]

        # Expand word-level labels to character level. Indexing into
        # ``tokens`` (rather than zipping) keeps a length mismatch an error.
        tag_rows.append(
            [tag for pos, tag in enumerate(tags) for _ in tokens[pos]])
        relation_rows.append(
            [rel for pos, rel in enumerate(relations) for _ in tokens[pos]])

    return char_rows, tag_rows, relation_rows


def reDealText(string):
    """Turn Chinese placeholder phrases back into alphanumeric car terms.

    The substitutions are applied one after another in the order listed, each
    one replacing every occurrence of its phrase.

    Args:
        string: text containing the placeholder phrases.

    Returns:
        The text with every listed phrase replaced by its term.
    """
    restorations = (
        # car models
        ('奥迪叉六', 'A6'),
        ('奥迪叉八', 'A8'),
        # car terminology
        ('汽售中心', '4S店'),
        ('七档无极变速', '7档CVT'),
        ('车载智能通信系统', 'GBOOK'),
        ('右侧立柱', '右侧B柱'),
        ('手动档', 'MT'),
        ('自动档', 'AT'),
        ('定位系统', 'GPS'),
        ('智能可变气门正时系统', 'VVT-I'),
        ('四档', 'D档'),
        ('六档位', '6MT'),
        ('辅助', 'AUX'),
        ('光盘', 'CD'),
    )
    restored = string
    for phrase, term in restorations:
        restored = restored.replace(phrase, term)
    return restored


def dealText(string):
    """Normalise a sentence into Chinese-only text.

    Steps, in order:
      1. convert with ``cht_to_chs`` and upper-case the result;
      2. map sentence punctuation to the fullwidth comma;
      3. replace colloquial abbreviations, car models and car terms with
         Chinese phrases;
      4. remove every character matched by ``filtrate2``.

    The substitutions run sequentially, so a rule whose key is contained in a
    later rule's key would make the later rule unreachable. Two such pairs are
    ordered longest-key-first here: the ``BOOK`` family runs before ``OK`` and
    ``6MT`` runs before ``MT``.

    Args:
        string: the raw sentence.

    Returns:
        The normalised string.
    """
    string = cht_to_chs(string)
    string = string.upper()

    # Punctuation (the ASCII colon is not mapped here). Every ASCII comma
    # becomes a fullwidth comma, so no separate comma removal is needed.
    for mark in ('。', '？', '！', ',', '.', '?', '!', '~', '…', '-'):
        string = string.replace(mark, '，')

    substitutions = (
        # colloquial expressions
        ('牛B', '牛逼'),
        ('NB', '牛逼'),
        ('TMD', '他妈的'),
        ('MM', '妹妹'),
        ('NO1', '第一'),
        # Must precede the 'OK' rule, which would otherwise consume the tail
        # of 'BOOK'. 'G，BOOK' is what 'G-BOOK' looks like after the
        # punctuation step above.
        ('G，BOOK', '车载智能通信系统'),
        ('GBOOK', '车载智能通信系统'),
        ('BOOK', '车载智能通信系统'),
        ('OK', '不错'),
        ('数1数2', '数一数二'),
        # car models
        ('ES', '雷克萨斯'),
        ('LS', '雷克萨斯'),
        ('IS300', '雷克萨斯'),
        ('SANTANA', '桑塔纳'),
        ('CIVIC', '思域'),
        ('MDX', '讴歌'),
        ('X6', '宝马'),
        ('A6', '奥迪叉六'),
        ('A8', '奥迪叉八'),
        ('KK', '斯帕克'),
        ('G35', '英菲迪尼'),
        ('BYD', '比亚迪'),
        ('CRV', '本田'),
        ('CTS', '凯迪拉克'),
        # car terminology
        ('SSSS店', '4S'),
        ('SSSS', '4S'),
        ('4S店', '4S'),
        ('4S', '汽售中心'),
        ('7档CVT', '七档无极变速'),
        ('右侧B柱', '右侧立柱'),
        # Must precede the 'MT' rule, which would otherwise consume it.
        ('6MT', '六档位'),
        ('MT', '手动档'),
        ('AT', '自动档'),
        ('V6', '发动机'),
        ('GPS', '定位系统'),
        # 'VVT-I' after the punctuation step above.
        ('VVT，I', '智能可变气门正时系统'),
        ('D档', '四档'),
        ('AUX', '辅助'),
        ('CD', '光盘'),
        ('CROWN', '皇冠'),
    )
    for key, phrase in substitutions:
        string = string.replace(key, phrase)

    # Filter: drop everything ``filtrate2`` matches.
    string = filtrate2.sub(r'', string)

    return string


def write(word, flag, parser, path='./data/sample_test.txt'):
    """Write character-level samples to a UTF-8 text file.

    Each character becomes one line of the form
    ``char<TAB>flag<TAB>relation<TAB>``; a blank line follows every sequence.
    An existing file at *path* is overwritten.

    Args:
        word: list of character lists, one per sequence.
        flag: list of label lists parallel to *word*.
        parser: list of label lists parallel to *word*.
        path: output file; defaults to the previously hard-coded location.

    Raises:
        IndexError: if *flag* or *parser* is shorter than *word*, overall or
            for any single sequence.
    """
    # The context manager closes the file even if a row fails to format.
    with codecs.open(path, 'w', 'utf-8') as fw:
        for i, chars in enumerate(word):
            for j, char in enumerate(chars):
                fw.write(char + '\t' + flag[i][j] + '\t' + parser[i][j] + '\t\n')
            fw.write('\n')


def writetxt(string):
    """Split *string* into clauses, featurise them and write the sample file.

    Runs ``input1`` -> ``data_prepare`` -> ``write`` in that order.

    Args:
        string: the raw sentence.

    Returns:
        The list of clauses produced by ``input1``.
    """
    # The dealText normalisation step is currently disabled:
    # string = dealText(string)
    clauses = input1(string)
    chars, tags, relations = data_prepare(clauses)
    write(chars, tags, relations)
    return clauses