代码拉取完成,页面将自动刷新
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 7 17:19:35 2019
@author: quliang
"""
import pymysql
from pymongo import MongoClient
import logging
import re
import json
def get_text(s):
    """Collect every single-quoted ``'text'`` value found in ``str(s)``.

    The argument is rendered with ``str()`` and scanned for fragments of the
    form ``'text': '<value>'``; the captured values are returned joined by a
    single space. An empty string is returned when nothing matches.

    NOTE(review): matching happens on the textual rendering, so a value that
    Python renders with double quotes (for instance one containing an
    apostrophe) is not captured here -- confirm this is intended.
    """
    rendered = str(s)
    matches = re.compile(r'\'text\': \'(.*?)\'').finditer(rendered)
    return ' '.join(match.group(1) for match in matches)
def get_court_consider(s):
    """Collect every double-quoted ``'text'`` value found in ``str(s)``.

    The argument is rendered with ``str()`` and scanned for fragments of the
    form ``'text': "<value>"``; the captured values are returned joined by a
    single space. An empty string is returned when nothing matches.

    NOTE(review): only values rendered with double quotes are captured;
    values rendered with single quotes are skipped -- confirm this is
    intended.
    """
    rendered = str(s)
    matches = re.compile(r'\'text\': \"(.*?)\"').finditer(rendered)
    return ' '.join(match.group(1) for match in matches)
def data_to_mysql(data, keys, test=False):
    """Bulk-insert rows into the ``judgement_doc`` MySQL table.

    Args:
        data: Sequence of dicts, one per row. Each dict supplies the values
            for the named ``%(key)s`` placeholders built from ``keys``.
        keys: Column names to insert; they are used both as the column list
            and as the placeholder names.
        test: When true, write to the ``test`` database instead of
            ``renren_data``.

    Returns:
        None. Nothing is done (and no connection is opened) when ``data`` is
        empty.
    """
    # Nothing to insert: avoid opening a connection for an empty batch.
    if not data:
        return

    db_data = 'test' if test else 'renren_data'

    # Column names are back-quoted; values go through driver-side
    # parameter binding via the %(name)s placeholders.
    cols = ", ".join('`{}`'.format(k) for k in keys)
    val_cols = ', '.join('%({})s'.format(k) for k in keys)
    sql = "insert into judgement_doc(%s) values(%s)"
    res_sql = sql % (cols, val_cols)

    # NOTE(review): host and credentials are hard-coded here; consider
    # loading them from configuration or the environment.
    db = pymysql.connect(host='193.112.56.204', user='renrendata', passwd='data@2018',
                         db=db_data, port=3306, charset='utf8')
    try:
        with db.cursor() as cursor:
            cursor.executemany(res_sql, data)
        db.commit()
    finally:
        # Always release the connection, including when the insert fails
        # before commit() is reached.
        db.close()
# Root logger limited to ERROR, so the info() calls below are suppressed
# unless the level is lowered.
logger = logging.getLogger()
logging.basicConfig(format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
logger.setLevel(logging.ERROR)

# NOTE(review): host and credentials are hard-coded here; consider loading
# them from configuration or the environment.
client = MongoClient('mongodb://admin:admin@193.112.23.12:27017')
db = client.law_nlp

# Collections to export, one per province.
# (db.list_collection_names() would enumerate every collection instead.)
collist = ['zhejiang', 'fujian', 'hebei', 'neimenggu',
           'hainan', 'hubei', 'heilongjiang', 'jiangsu', 'anhui', 'guizhou', 'liaoning',
           'qinghai', 'gansu', 'chongqing', 'henan', 'hunan', 'guangxi']

# Columns of the judgement_doc table; every row starts with all of them
# set to the empty string.
keys = ['province', 'save_id', 'judgementId', 'judgementType', 'reason_name',
        'title', 'regulation', 'trial_procedure', 'original_told', 'was_told',
        'court_get', 'court_consider', 'court_result', 'frist_court_get', 'appellor_told',
        'by_appellor_told', 'frist_court_consider', 'second_court_result']

# Maps a paragraph's typeText to (target column, extractor function).
# Paragraphs whose typeText is not listed here are ignored.
SECTION_FIELDS = {
    '审理经过': ('trial_procedure', get_text),
    '一审法院查明': ('frist_court_get', get_text),
    '原告诉称': ('original_told', get_text),
    '上诉人诉称': ('appellor_told', get_text),
    '被告辩称': ('was_told', get_text),
    '被上诉人辩称': ('by_appellor_told', get_text),
    '本院查明': ('court_get', get_text),
    '一审法院认为': ('frist_court_consider', get_court_consider),
    '本院认为': ('court_consider', get_court_consider),
    '裁判结果': ('court_result', get_text),
    '二审裁判结果': ('second_court_result', get_text),
}

try:
    for col in collist:
        w = []
        collection = db[col]
        print('finding data ing')
        # Materialise the cursor once so the emptiness check and the count
        # below operate on the actual documents rather than on the cursor
        # object.
        result = list(collection.find({"content.judgementType":"判决","content.title":{'$regex':'婚'}}))
        print('{} data collected finished'.format(col))
        lens = len(result)
        print('{}有{}条数据'.format(col,lens))
        if not result:
            # Nothing matched in this collection; skip the insert.
            continue

        for inx, res in enumerate(result):
            logger.info('res detail %s', res)
            content = res['content']

            panjue_data = {k: '' for k in keys}
            panjue_data['province'] = col
            # save_id is the document's position within this collection's
            # result list.
            panjue_data['save_id'] = inx
            panjue_data['judgementType'] = content['judgementType']
            panjue_data['title'] = content['title']
            if res.get('judgementId'):
                panjue_data['judgementId'] = res['judgementId']
            reason = content.get('reason')
            if reason and reason.get('name'):
                panjue_data['reason_name'] = reason['name']

            # Parse the cited statutes, grouped by trial round.
            round_infos = content.get('regulationGroupByTrialRoundInfos')
            if round_infos:
                regulation_parts = []
                for round_info in round_infos:
                    # Trial-round label, wrapped in '&' markers and followed
                    # by a newline.
                    if round_info.get('trialRoundText'):
                        regulation_parts.append('&' + round_info['trialRoundText'] + '&' + '\n')
                    for regulation in round_info['regulations']:
                        logger.info('sectionParagraphs detail %s', regulation)
                        regulation_parts.append(regulation['text'])
                        # NOTE(review): get_text() scans the whole entry, so
                        # it can capture the top-level 'text' appended just
                        # above a second time -- verify the duplication is
                        # intended.
                        regulation_parts.append(get_text(regulation))
                        # Each statute entry is terminated by a newline.
                        regulation_parts.append('\n')
                panjue_data['regulation'] = ''.join(regulation_parts)
            else:
                # judgementId is optional (see the .get() check above), so
                # it is read with .get() to keep logging from raising.
                logger.info('judgementId %s %s is not exsixt keywords:regulationGroupByTrialRoundInfos',
                            res.get('judgementId'), col)

            # Parse each section of the judgment into its column; a later
            # paragraph with the same typeText overwrites an earlier one.
            for sub in content['paragraphs']:
                field = SECTION_FIELDS.get(sub['typeText'])
                if field is None:
                    continue
                column, extract = field
                panjue_data[column] = extract(sub['subParagraphs'])

            w.append(panjue_data)
            if inx%1000==0:
                print('=======> {}有{}条数据已保存'.format(col,inx))

        # One bulk insert per collection.
        data_to_mysql(w,keys,test=False)
finally:
    # Release the MongoDB connection whether or not the export succeeded.
    client.close()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。