1 Star 2 Fork 1

sesepp/Image-Captioning-PyTorch

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
pre_process.py 1.65 KB
一键复制 编辑 原始数据 按行查看 历史
Yang Liu 提交于 2020-01-09 14:56 . update
import json
import zipfile
from collections import Counter
import jieba
from tqdm import tqdm
from config import *
from utils import ensure_folder
def extract(folder, dest='data'):
    """Unpack the archive ``<folder>.zip`` into the directory *dest*.

    Args:
        folder: Archive path without the ``.zip`` suffix; the file that is
            opened is ``'{}.zip'.format(folder)``.
        dest: Directory the archive members are extracted into. Defaults to
            ``'data'``, which is what this function always used before the
            parameter existed.

    Raises:
        FileNotFoundError: If the archive does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    filename = '{}.zip'.format(folder)
    print('Extracting {}...'.format(filename))
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(dest)
def create_input_files():
    """Build the caption vocabulary and save it as ``WORDMAP.json``.

    Loads the sample list from ``train_annotations_filename``, segments every
    entry of each sample's ``'caption'`` field with jieba (full mode) and
    counts the resulting tokens. Tokens seen more than ``min_word_freq``
    times receive consecutive ids starting at 1; ``<unk>``, ``<start>`` and
    ``<end>`` take the next three ids and ``<pad>`` is fixed at 0. The
    mapping is written to ``WORDMAP.json`` inside ``data_folder``.
    """
    # Imported locally so this function does not rely on
    # ``from config import *`` happening to re-export ``os``.
    import os

    json_path = train_annotations_filename

    # Read JSON. Decode explicitly as UTF-8 (the JSON interchange encoding)
    # so that non-ASCII captions load the same way on every platform instead
    # of depending on the locale's default encoding.
    with open(json_path, 'r', encoding='utf-8') as j:
        samples = json.load(j)

    # Count token frequencies over every caption of every sample.
    word_freq = Counter()
    for sample in tqdm(samples):
        caption = sample['caption']
        for c in caption:
            seg_list = jieba.cut(c, cut_all=True)
            # Update word frequency
            word_freq.update(seg_list)

    # Create word map: keep tokens strictly above the threshold, ids from 1.
    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    # Special tokens are appended after the regular vocabulary; 0 is padding.
    word_map['<unk>'] = len(word_map) + 1
    word_map['<start>'] = len(word_map) + 1
    word_map['<end>'] = len(word_map) + 1
    word_map['<pad>'] = 0

    print(len(word_map))
    print(words[:10])

    # Save word map to a JSON. json.dump escapes non-ASCII by default, so
    # the explicit encoding only pins down the file's byte representation.
    with open(os.path.join(data_folder, 'WORDMAP.json'), 'w',
              encoding='utf-8') as j:
        json.dump(word_map, j)
if __name__ == '__main__':
    # Make sure the extraction target exists before unpacking anything.
    ensure_folder('data')

    # (extracted image folder, archive base name) for each dataset split,
    # in the order they are processed.
    archives = (
        (train_image_folder, train_folder),
        (valid_image_folder, valid_folder),
        (test_a_image_folder, test_a_folder),
        (test_b_image_folder, test_b_folder),
    )
    for image_dir, archive in archives:
        # Only unpack a split whose image folder is not present yet.
        if os.path.isdir(image_dir):
            continue
        extract(archive)

    # Build and save the vocabulary from the training annotations.
    create_input_files()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/sesepp/Image-Captioning-PyTorch.git
git@gitee.com:sesepp/Image-Captioning-PyTorch.git
sesepp
Image-Captioning-PyTorch
Image-Captioning-PyTorch
master

搜索帮助