1 Star 0 Fork 0

仓葵与暮/DataExtraction

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
extract_change.py 9.20 KB
一键复制 编辑 原始数据 按行查看 历史
仓葵与暮 提交于 2022-04-20 21:02 . 补充注释
import argparse
import re
import os
import pymongo
from tqdm import tqdm
from utils.utils import write_to_pkl, read_from_pkl, remove_comments
# Input pickle template, filled with (project, project, split).
# e.g. cv_change/qt/qt_train_changed.pkl
from_path = "cv_change/{}/{}_{}_changed.pkl"

# Output directory template, filled with the project name. The per-variant
# subdirectory and file name are appended later by read_write_process.
# e.g. results/cv_change/qt/
out_path = "results/cv_change/{}/"

# Directory template for the cached intermediate pickles, filled with the
# project name.
# e.g. results/cv_change/temp_qt/
temp_path = "results/cv_change/temp_{}/"

# Command-line arguments: run this file from the shell with `-project NAME`.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-project",
    default="openstack",
    type=str,
)
def dimension_original_pkl(db_name, data):
    """
    Build the intermediate structure that the later merge steps consume, so
    the database only has to be queried once per commit id.

    :param db_name: database handle (despite the name, it is indexed with
        ``db_name['diff']`` to obtain the collection that is queried).
    :param data: sequence whose element 0 holds the ids to look up and whose
        elements 1 and 2 are indexed in parallel with the ids (presumably
        labels and commit messages -- confirm against the producer of the
        input pickle).
    :return: ``[ids, labels, msgs, codes]``, four parallel lists containing
        only the ids that were found in the ``diff`` collection. Each entry
        of ``codes`` is a list of files, each file a list of blocks, each
        block a dict with an ``'a'`` and/or ``'b'`` list of stripped lines.
    """
    print("extract_change.py: start dimension original pkl from database...")
    ids = data[0]
    diff = db_name['diff']
    bar = tqdm(ids, ncols=100)
    id_to_save, codes, label_save, msg_save = [], [], [], []
    for pos, _id in enumerate(bar):
        bar.set_description("Processing %s" % (_id[:8] + '...'))
        obj = diff.find_one({"_id": _id}, {"_id": 0})
        # find_one returns None when no document matches; skip such ids so
        # the four output lists stay aligned.
        if obj is None:
            continue
        id_to_save.append(_id)
        label_save.append(data[1][pos])
        msg_save.append(data[2][pos])
        # Walk every file stored in the document.
        code = []
        for value in obj.values():
            # Walk the `content` array of one file object.
            file_diff = []
            for field in value['content']:
                # Each element is a dict keyed by a / b / ab; only 'a' and
                # 'b' are kept, always in that order, and only when they
                # still contain lines after comment removal.
                block_diff = {}
                for model in ('a', 'b'):
                    if model not in field:
                        continue
                    lines = [line.strip() for line in remove_comments(field[model])]
                    if lines:
                        block_diff[model] = lines
                if block_diff:
                    file_diff.append(block_diff)
            if file_diff:
                code.append(file_diff)
        codes.append(code)
    print("extract_change.py: dimension original pkl ended.")
    return [id_to_save, label_save, msg_save, codes]
def code_merge(data):
    """
    Merge the added and removed code of every commit into one tagged token
    string (the input sequence of the left-most branch).

    Files are separated by SEPARATOR_FOR_SENTENCE; inside a file every block
    contributes ``add <lines>`` and/or ``remove <lines>``.
    e.g. add codes remove codes SEPARATOR_FOR_SENTENCE add codes remove codes ...

    :param data: list of commits; each commit is a list of files, each file a
        list of block dicts such as ``{'a': [...], 'b': [...]}``.
    :return: list with one string per commit; runs of tabs/spaces are
        collapsed to a single space.
    """
    add_tag, remove_tag, place_tag, file_tag = 'add', 'remove', '<s>', 'SEPARATOR_FOR_SENTENCE'
    # NOTE(review): 'a' is tagged as add and 'b' as remove here. In Gerrit's
    # diff format 'a' is the old side and 'b' the new side -- confirm that
    # the stored data really uses the mapping assumed by this function.
    tag_for_key = {'a': add_tag, 'b': remove_tag}
    codes_str = []
    # Each `code` holds all file changes of one commit: [file1, file2, ...]
    for code in data:
        tokens = []
        # Each `file_diff` is the list of blocks of one file,
        # e.g. [{'a': [...], 'b': [...]}, {'a': [...], 'b': [...]}, ...]
        for file_index, file_diff in enumerate(code):
            # Put the separator between files rather than appending one
            # after every file and deleting the last one afterwards.
            if file_index:
                tokens.append(file_tag)
            for block in file_diff:
                for key, lines in block.items():
                    tag = tag_for_key.get(key)
                    if tag is None:
                        print('error')
                        continue
                    tokens.append(tag)
                    tokens.extend(lines)
            # A file without any block is represented by the placeholder.
            if not file_diff:
                tokens.append(place_tag)
        codes_str.append(re.sub(r"[\t ]+", " ", ' '.join(tokens)))
    return codes_str
def code_merge_no_tag(data):
    """
    Merge all changed lines of every commit into one token string without
    the add/remove tags.

    Files are separated by SEPARATOR_FOR_SENTENCE and a file without any
    block is represented by the placeholder ``<s>``.

    :param data: list of commits; each commit is a list of files, each file a
        list of block dicts mapping a key to a list of lines.
    :return: list with one string per commit; runs of tabs/spaces are
        collapsed to a single space.
    """
    place_tag, file_tag = '<s>', 'SEPARATOR_FOR_SENTENCE'
    codes_str = []
    for code in data:
        tokens = []
        for file_index, file_diff in enumerate(code):
            # Separator goes between files, so none has to be removed later.
            if file_index:
                tokens.append(file_tag)
            for block in file_diff:
                # Only the lines matter here; the keys are ignored.
                for lines in block.values():
                    tokens.extend(lines)
            if not file_diff:
                tokens.append(place_tag)
        codes_str.append(re.sub(r"[\t ]+", " ", ' '.join(tokens)))
    return codes_str
def code_merge_no_code(data):
    """
    Produce, for every commit, the sequence of add/remove tags only, with
    the code lines themselves left out.

    Files are separated by SEPARATOR_FOR_SENTENCE and a file without any
    block is represented by the placeholder ``<s>``.

    :param data: list of commits; each commit is a list of files, each file a
        list of block dicts keyed by ``'a'`` and/or ``'b'``.
    :return: list with one string per commit.
    """
    add_tag, remove_tag, place_tag, file_tag = 'add', 'remove', '<s>', 'SEPARATOR_FOR_SENTENCE'
    codes_str = []
    for code in data:
        tokens = []
        for file_index, file_diff in enumerate(code):
            # Separator goes between files, so none has to be removed later.
            if file_index:
                tokens.append(file_tag)
            for block in file_diff:
                # Only the keys are needed; the line lists are ignored.
                for key in block:
                    if key == 'a':
                        tokens.append(add_tag)
                    elif key == 'b':
                        tokens.append(remove_tag)
                    else:
                        print('error: code_merge_no_code')
            if not file_diff:
                tokens.append(place_tag)
        codes_str.append(re.sub(r"[\t ]+", " ", ' '.join(tokens)))
    return codes_str
def add_remove_merge(data, pattern):
    """
    Extract only one side of the change (the blocks stored under the key
    `pattern`) for every commit. To keep the file positions aligned with the
    other outputs, a file that has no block with that key is represented by
    the placeholder ``<s>``.

    :param data: list of commits; each commit is a list of files, each file a
        list of block dicts such as ``{'a': [...], 'b': [...]}``.
    :param pattern: block key to extract (the caller passes ``'a'`` or ``'b'``).
    :return: list with one string per commit; files are separated by
        SEPARATOR_FOR_SENTENCE and runs of tabs/spaces are collapsed.
    """
    place_tag, file_tag = '<s>', 'SEPARATOR_FOR_SENTENCE'
    codes_str = []
    for code in data:
        tokens = []
        # Each `file_diff` is the list of blocks of one file,
        # e.g. [{'a': [...], 'b': [...]}, {'a': [...], 'b': [...]}, ...]
        for file_index, file_diff in enumerate(code):
            # Separator goes between files, so none has to be removed later.
            if file_index:
                tokens.append(file_tag)
            has = False
            for block in file_diff:
                # Direct lookup instead of scanning every item for the key.
                if pattern in block:
                    tokens.extend(block[pattern])
                    has = True
            if not has:
                tokens.append(place_tag)
        codes_str.append(re.sub(r"[\t ]+", " ", ' '.join(tokens)))
    return codes_str
def read_write_process(t, p, has_tag=True):
    """
    Build and write the final changed-code pickle for one split of a project.

    The database extraction is cached in ``temp_path`` so it runs only once
    per (project, split); afterwards the cached pickle is reloaded and the
    merged string variants are derived from it.

    Relies on the module globals ``from_path``, ``temp_path``, ``out_path``
    and ``client``; ``client`` is only assigned in the ``__main__`` section.

    :param t: split name, e.g. ``'test'`` or ``'train'``.
    :param p: project name; also used as the database name.
    :param has_tag: when True the merged code keeps the add/remove tags and
        is written under ``codes_has_tag/``; otherwise the untagged merge is
        written under ``codes_no_tag/``.
    :return: None
    """
    print(f'processing {t} data ...')
    temp_name = f'{p}_{t}.pkl'
    temp_file = temp_path + temp_name
    # Run the extraction ahead of time and cache it.
    if not os.path.exists(temp_file):
        # Read the pickle of the requested split. This used to hard-code
        # 'test', so the 'train' cache was built from the test data; caches
        # produced by that version are stale and must be deleted by hand.
        raw = read_from_pkl(from_path.format(p, p, t))
        write_to_pkl(dimension_original_pkl(client[p], raw), temp_path, temp_name)

    data = read_from_pkl(temp_file)
    pkl = [data[0], data[1], data[2]]
    if has_tag:
        pkl.append(code_merge(data[3]))
        has_path = 'codes_has_tag/'
    else:
        pkl.append(code_merge_no_tag(data[3]))
        has_path = 'codes_no_tag/'
    pkl.extend([add_remove_merge(data[3], 'a'),
                add_remove_merge(data[3], 'b'),
                code_merge_no_code(data[3])])
    write_to_pkl(pkl, out_path + has_path, f'{p}_{t}_changed.pkl')
    print(f'processing {t} data done.')
if __name__ == '__main__':
    args = parser.parse_args()
    project = str(args.project)
    # Connect to MongoDB. `client` must stay a module-level name because
    # read_write_process() reads it as a global.
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    try:
        db_list = client.list_database_names()
        if project in db_list:
            print(project, ' exists.')
            # Specialise the path templates for the chosen project.
            temp_path = temp_path.format(project)
            out_path = out_path.format(project)
            # Same order as before: tagged test/train, then untagged test/train.
            for with_tag in (True, False):
                for split in ('test', 'train'):
                    read_write_process(split, project, has_tag=with_tag)
        else:
            # Previously a missing database was skipped without any message.
            print(project, ' does not exist in MongoDB, nothing processed.')
    finally:
        # Release the connection even when processing fails.
        client.close()
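# Legacy inline version of the extraction and merge steps now performed by read_write_process(); kept commented out for reference only.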
# if not os.path.exists(temp_path + f'{project}_test.pkl'):
# test_data = read_from_pkl(from_path.format(project, project, 'test'))
# test_pkl = dimension_original_pkl(client[project], test_data)
# write_to_pkl(test_pkl, temp_path, f'{project}_test.pkl')
#
# test_data = read_from_pkl(temp_path + f'{project}_test.pkl')
# test_pkl = [test_data[0], test_data[1], test_data[2],
# code_merge(test_data[3]),
# add_remove_merge(test_data[3], 'a'),
# add_remove_merge(test_data[3], 'b'),
# code_merge_no_tag(test_data[3])]
# write_to_pkl(test_pkl, out_path, f'{project}_test_changed.pkl')
#
# if not os.path.exists(temp_path + f'{project}_train.pkl'):
# train_data = read_from_pkl(from_path.format(project, project, 'train'))
# train_pkl = dimension_original_pkl(client[project], train_data)
# write_to_pkl(train_pkl, temp_path, f'{project}_train.pkl')
#
# train_data = read_from_pkl(temp_path + f'{project}_train.pkl')
# train_pkl = [train_data[0], train_data[1], train_data[2],
# code_merge(train_data[3]),
# add_remove_merge(train_data[3], 'a'),
# add_remove_merge(train_data[3], 'b'),
# code_merge_no_tag(train_data[3])]
# write_to_pkl(train_pkl, out_path, f'{project}_train_changed.pkl')
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/fortherepublic-cpp/data-extraction.git
git@gitee.com:fortherepublic-cpp/data-extraction.git
fortherepublic-cpp
data-extraction
DataExtraction
master

搜索帮助