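# NOTE: banner text from the code-hosting page, captured with the file (not part of the program):
# "Code pull complete; the page will refresh automatically."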
import os
import csv
import random
import pandas as pd
import numpy as np
def readList(path):
    """Load the ``title`` column of a UTF-8 encoded CSV file.

    :param path: location of the CSV file; a ``title`` column must exist,
        otherwise the column lookup raises ``KeyError``.
    :return: numpy array with one entry per data row of the ``title`` column.
    """
    frame = pd.read_csv(path, encoding='utf-8')
    return np.array(frame["title"])
def getDataSet(list, proportion):
    """Randomly split a collection of samples into a training and a test set.

    :param list: iterable of samples. The name shadows the builtin; it is kept
        only so that existing keyword callers continue to work.
    :param proportion: test-set size divided by data-set size. The number of
        test samples is ``int(total * proportion)``.
    :return: ``(trainDataSet, testDataSet)`` as two new lists. The training
        list keeps the input order; the test list is in the order drawn by
        ``random.sample``.
    :raises ValueError: propagated from ``random.sample`` when the computed
        test size is negative or larger than the number of samples.
    """
    # Copy into a fresh list: the caller's collection is left untouched and
    # any iterable (e.g. a numpy array) can be indexed uniformly below.
    # ``[*list]`` is used because the builtin ``list`` is shadowed here.
    samples = [*list]
    test_count = int(len(samples) * proportion)

    # Draw positions rather than values. Removing the drawn values one by one
    # with ``list.remove`` is quadratic and relies on ``==`` between elements,
    # which breaks for elements such as numpy rows.
    test_positions = random.sample(range(len(samples)), test_count)
    held_out = set(test_positions)

    testDataSet = [samples[pos] for pos in test_positions]
    trainDataSet = [
        sample for pos, sample in enumerate(samples) if pos not in held_out
    ]
    return trainDataSet, testDataSet
def segText(inputPath, resultPath):
    """Split every CSV under *inputPath* into train/test CSVs under *resultPath*.

    *inputPath* is expected to contain one level of sub-folders, each holding
    CSV files readable by ``readList`` (i.e. with a ``title`` column). Each
    file is split with ``getDataSet(content, 0.1)``; the training part is
    written below ``resultPath[0]`` and the test part below ``resultPath[1]``,
    keeping the original sub-folder and file name.

    Each output file is written as ``utf-8-sig`` with a single ``title``
    header row followed by one row per sample. The header is written even
    when a split is empty.

    :param inputPath: root folder of the source data set.
    :param resultPath: sequence whose first two entries are the output roots
        for the training split and the test split, in that order.
    """
    for eachDir in os.listdir(inputPath):  # every sub-folder of the root
        eachPath = os.path.join(inputPath, eachDir)
        for eachFile in os.listdir(eachPath):  # every CSV in that sub-folder
            content = readList(os.path.join(eachPath, eachFile))
            splits = getDataSet(content, 0.1)  # (train samples, test samples)

            for num, rows in enumerate(splits):
                each_resultPath = os.path.join(resultPath[num], eachDir)
                os.makedirs(each_resultPath, exist_ok=True)

                target = os.path.join(each_resultPath, eachFile)
                # ``with`` guarantees the file is flushed and closed; the
                # header is emitted once per file, not once per row.
                with open(target, 'w', encoding='utf-8-sig', newline='') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=['title'])
                    writer.writeheader()
                    writer.writerows({'title': title} for title in rows)
def saveascsv(writer, title):
    """Emit the header row and then one ``title`` record through *writer*.

    The header is written on every call, so calling this repeatedly with the
    same writer produces a header line before each record.

    :param writer: object providing ``writeheader()`` and ``writerow(dict)``,
        e.g. a ``csv.DictWriter`` configured with a ``title`` field.
    :param title: value stored under the ``title`` key of the written row.
    """
    writer.writeheader()
    writer.writerow({'title': title})
if __name__ == '__main__':
    # Root folder that holds the source data set.
    data_path = './total/'
    # Output roots: training split first, test split second.
    set_path = ['./train/', './test/']
    segText(inputPath=data_path, resultPath=set_path)
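# NOTE: moderation notice from the code-hosting page, captured with the file (not part of the program):
# "This section may contain content unsuitable for display and is not shown. You can review and modify it using the relevant editing functions."
# "If you confirm the content involves no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything violating national laws and regulations, you may click submit to appeal and it will be handled as soon as possible."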