1 Star 0 Fork 0

KunCheng-He/LawText-Analysis

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
data-to-mongodb.py 4.20 KB
一键复制 编辑 原始数据 按行查看 历史
byack 提交于 2021-04-13 20:53 . Model
'''
重命名之后的数据,需要进行数据清洗
这里我们只记录文书名,案件内容,裁定结果是批准还是驳回
将这些清洗好的数据放到mongodb中
'''
from docx import Document
from pymongo import MongoClient
import jieba
import os
def data_to_mongodb(db):
    '''Read the renamed .docx rulings under ./data and load them into MongoDB.

    For each file, record the document number, the title, the paragraph
    announcing the ruling ("裁定如下"), the paragraph that follows it, and a
    binary label: 1 when that paragraph contains "准许" (granted), 0 when it
    contains "驳回" (rejected). Files where neither keyword is found are
    skipped.

    :param db: MongoDB database handle; documents go to db.law_data.
    '''
    for dir_name in os.listdir("./data"):
        book_path = "./data/" + dir_name
        for law_book in os.listdir(book_path):
            # One document per source file, in the insertion schema.
            data = {
                "num_id": law_book,
                "title": None,
                "content": None,
                "label_content": None,
                "label": None,
            }
            path = book_path + "/" + law_book
            print(path)
            # Context manager guarantees the handle is closed
            # (the original code never closed it).
            with open(path, 'rb') as f:
                document = Document(f)
                paragraphs = [p.text for p in document.paragraphs]
            # Paragraph index 1 holds the title in this corpus's layout
            # (index 0 is presumably the court name) — TODO confirm.
            data['title'] = paragraphs[1]
            # The ruling outcome sits in the paragraph immediately after
            # the one announcing "裁定如下".
            for i, text in enumerate(paragraphs):
                if "裁定如下" in text:
                    data["content"] = text
                    data["label_content"] = paragraphs[i + 1]
                    if "准许" in paragraphs[i + 1]:
                        data["label"] = 1
                    elif "驳回" in paragraphs[i + 1]:
                        data["label"] = 0
                    break
            # Keep only documents whose outcome could be classified.
            if data["label"] is not None:
                db.law_data.insert_one(data)
def clear(db):
    '''Second cleaning pass: strip statute-citation sentences and re-store.

    Splits each law_data document's content on '。', drops every sentence
    containing '《' (a statute citation), re-joins the rest with '。'
    re-appended, and inserts the slimmed record into db.clear. Documents
    whose cleaned text is empty are dropped entirely.

    :param db: MongoDB database handle; reads db.law_data, writes db.clear.
    '''
    for content in db.law_data.find():
        # Keep sentences without a citation; re-append the delimiter that
        # split() removed (this reproduces the original trailing '。').
        text_new = ''.join(
            sentence + '。'
            for sentence in content['content'].split('。')
            if '《' not in sentence
        )
        if text_new:
            db.clear.insert_one({
                'num_id': content['num_id'],
                'content': text_new,
                'label': content['label'],
            })
            print(content['num_id'])
def train_test_class(db, test_1_num=5, test_0_num=56):
    '''Split the cleaned documents into test and train collections.

    The first `test_1_num` label-1 documents and the first `test_0_num`
    label-0 documents encountered go to db.test; every remaining document
    goes to db.train. Defaults reproduce the original hard-coded split
    (5 granted / 56 rejected test samples).

    :param db: MongoDB database handle; reads db.clear, writes db.test
        and db.train.
    :param test_1_num: number of label-1 documents reserved for the test set.
    :param test_0_num: number of label-0 documents reserved for the test set.
    '''
    for item in db.clear.find():
        if item["label"] == 1 and test_1_num > 0:
            db.test.insert_one(item)
            test_1_num -= 1
        elif item["label"] == 0 and test_0_num > 0:
            db.test.insert_one(item)
            test_0_num -= 1
        else:
            db.train.insert_one(item)
def data_jieba(db):
    '''Tokenise the train and test documents with jieba and store the result.

    Each document's content is replaced by its jieba tokens joined with
    spaces (listed punctuation removed); results are written to the
    train_jieba / test_jieba collections.

    :param db: MongoDB database handle; reads db.train and db.test, writes
        db.train_jieba and db.test_jieba.
    '''
    punctuations = [',', '。', '、', ';', '“', '”', '——', '—-']

    def _segment(text):
        # Reproduce the original output format exactly: every kept token
        # is prefixed with a single space, so non-empty results start
        # with one leading space.
        return "".join(" " + tok for tok in jieba.lcut(text)
                       if tok not in punctuations)

    # The same transformation applies to both splits — the original
    # duplicated this loop verbatim for train and test.
    for source, target in ((db.train, db.train_jieba),
                           (db.test, db.test_jieba)):
        for item in source.find():
            target.insert_one({
                "num_id": item["num_id"],
                "content": _segment(item["content"]),
                "label": item["label"],
            })
if __name__ == "__main__":
    # Connect to the local MongoDB server; the LawData database and its
    # collections are created lazily on first insert.
    client = MongoClient()
    db = client["LawData"]
    # Pipeline: raw extraction -> second cleaning pass ->
    # train/test split -> jieba tokenisation.
    data_to_mongodb(db)
    clear(db)
    train_test_class(db)
    data_jieba(db)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/byack/law-text-analysis.git
git@gitee.com:byack/law-text-analysis.git
byack
law-text-analysis
LawText-Analysis
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385