4 Star 1 Fork 0

Green/encrypted-traffic-analysis-2021

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
method_DL_template2_wxw.py 11.80 KB
一键复制 编辑 原始数据 按行查看 历史
wangxw 提交于 2021-05-12 14:40 . 完善细节
#!D:/Code/python
# -*- coding: utf-8 -*-
# @Time : 2021/5/7 0007 20:20
# @Author : xgf
# @File : method_DL_template.py
# @Software : PyCharm
import numpy
import torch
from torch import nn, Tensor, optim
from torch.autograd import Variable
import torch.nn.functional as F
from typing import (
TypeVar, Type, Union, Optional, Any,
List, Dict, Tuple, Callable, NamedTuple
)
import random
import time
import os
import copy
import re
import logging
from concurrent.futures import ThreadPoolExecutor
from concurrent import futures
import itertools
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from utils import Args, D, timeit
# from log import Log
# Experiment/method identifier (consumed by the commented-out Log helper below).
method_name = 'encrypted-traffic-analysis-2021'
# mylog = Log('../encrypted-traffic-analysis-2021_log', method_name)
def get_args() -> Args:
    """Assemble the experiment's hyper-parameter container.

    @return: an ``Args`` holding batch size, learning rate, epoch count,
             dataset directories and class/split settings.
    """
    base_dir = os.path.dirname(__file__)
    return Args([
        D("batchSize", int, 64),
        D("learningRate", float, 1e-2),
        D("numEpochs", int, 5000),
        D("rawDataDir", str, os.path.join(base_dir, "dataset/traffic")),
        D("dataDir", str, os.path.join(base_dir, "dataset/traindata")),
        D("saveDir", str, None),
        D("nClass", int, 50),
        D('splitData', int, 0.8),
    ])
# Per-feature maximum values used to scale every raw 67-dimensional feature
# vector into roughly [0, 1] by element-wise division (see readNpy below).
# NOTE(review): presumably derived from the defence_90 training split — confirm
# before reusing with a different dataset.
max_features = [22607, 0.9339419978517723, 1599, 0.06605800214822771, 24206, 6971.819376000754, 10492.093183239525,
6974.724357711386, 12216.404564957757, 10039.0, 12290.0, 2.0249652023687483, 1.3203963666391412, 1, 19, 1, 1,
22, 8, 23, 7, 672.3888888888889, 413.1607353196532, 13, 1781, 586.5, 0.24022639701446408,
0.0014869999230530883, 0.006851341989027328, 0.43530082699999895, 0.0015701625259172789, 0.008577127393046323,
0.6145144896628096, 0.0225155147029179, 0.03382628600025843, 1, 1, 1, 1, 1, 0.012953087945389186,
2.0627147460742803, 18.66804293971924, 20.0, 20, 3.272895675818782, 3.296907216494845, 3.0, 29,
3.7787047836052214, 46.61237113402062, 47.0, 50, 80, 20, 88, 12, 136, 31.187239944521497, 25.0,
26.882848597066, 721, 16, 2.906801007556675, 2.0, 1.8423725513133162, 397]
def readNpy(file_path):
    """Load an .npy sample file and normalise every sample's feature vector.

    @param file_path: path to a pickled .npy object array whose rows look like
                      ``[feature_vector, label, ...]``
    @return: the loaded array with each row's feature vector divided
             element-wise by ``max_features``
    """
    data = np.load(file_path, allow_pickle=True)
    raw = [row[0] for row in data]
    print(len(raw), len(raw[0]))
    # Scale all feature vectors at once by the per-feature maxima.
    scaled = np.array(raw) / np.array(
        max_features
    )
    # Write the normalised vectors back into the loaded rows in place.
    for i in range(len(data)):
        data[i][0] = scaled[i]
    return data
class AverageMeter(object):
    """Tracks the most recent value plus a running sum, count and mean.

    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero out all tracked statistics."""
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
class EncryptTrafficDataset(Dataset):
    """Feature dataset of (feature vector, label) pairs.

    Expects ``traffic_data`` to be convertible to a 2-D array whose rows hold
    the feature vector in column 0 and the label in column 1 (extra columns
    are ignored).
    """

    def __init__(self, traffic_data, transform=None, target_transform=None):
        try:
            # BUG FIX: the 2-D slice raises IndexError (never IOError) when the
            # input is not at least 2-dimensional, and the slicing itself was
            # previously outside the try so the guard could never fire.
            traffic_data = np.array(traffic_data)[:, :2]
            self.traffic_features = traffic_data[:, 0]
            self.traffic_labels = traffic_data[:, 1]
        except IndexError:
            print("EncryptTrafficDataset初始化数据集失败,因为数据集传入错误")
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.traffic_labels)

    def __getitem__(self, idx):
        """Return the (feature tensor, label) pair at ``idx``.

        Optional ``transform`` / ``target_transform`` callables are applied
        to the feature vector / label before the feature is wrapped in a
        float Tensor.
        """
        feature = self.traffic_features[idx]
        label = self.traffic_labels[idx]
        if self.transform:
            feature = self.transform(feature)
        if self.target_transform:
            label = self.target_transform(label)
        # Removed the unused `sample` dict the original built and discarded.
        return (Tensor(feature), label)
class ThreeLinearNetwork(nn.Module):
    """A five-layer fully-connected classifier: 67 input features -> 50 classes.

    Emits raw (unnormalised) logits, as required by ``nn.CrossEntropyLoss``.
    """

    def __init__(self):
        super(ThreeLinearNetwork, self).__init__()
        # self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(67, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 50),
            # BUG FIX: removed the trailing nn.ReLU(). CrossEntropyLoss expects
            # raw logits; a ReLU on the output zeroes every negative logit,
            # killing its gradient and hampering training.
        )

    def forward(self, x):
        """Return class logits of shape (batch, 50) for input of shape (batch, 67)."""
        return self.linear_relu_stack(x)
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        dataloader: yields (features, labels) batches.
        model: network to optimise (already moved to its target device).
        loss_fn: criterion taking (logits, long-dtype labels).
        optimizer: optimiser over ``model.parameters()``.

    Returns:
        Mean per-batch loss as a float (the original returned None, which the
        caller immediately overwrote, so this is backward-compatible).
    """
    # BUG FIX: re-enable training mode; test() switches the model to eval()
    # and never switches back, which silently disabled Dropout for every
    # epoch after the first.
    model.train()
    # Derive the device from the model rather than relying on a global.
    device = next(model.parameters()).device
    total_loss = 0.0
    n_batches = 0
    for batch, (X, y) in enumerate(dataloader):
        # BUG FIX: Tensor.long() is not in-place — the original discarded it.
        X, y = X.to(device), y.long().to(device)
        # Compute prediction error.
        pred = model(X)
        loss = loss_fn(pred, y)
        # Backpropagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1
    return total_loss / n_batches if n_batches else 0.0
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

# Fit a companion random-forest classifier on the (normalised) training split;
# test() consults it alongside the MLP.
defence_90 = np.load("./feature_extraction/defence_90.npy", allow_pickle=True)
# defence_10 = np.load("./feature_extraction/defence_10.npy", allow_pickle=True)
features_train = [np.array(sample[0]) / max_features for sample in defence_90]
labels_train = [sample[1] for sample in defence_90]
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0).fit(features_train, labels_train)
def test(dataloader, model, epoch, name):
    """Evaluate ``model`` for one epoch and log accuracy to TensorBoard.

    Relies on module-level globals: ``device``, ``rfc`` (the fitted random
    forest), ``loss_fn`` and ``writer``.  For each batch a hybrid
    MLP/random-forest vote is computed into ``pred_list`` for diagnostics,
    but the returned accuracy uses the plain MLP argmax (the hybrid ``preds``
    line is commented out below).

    @param dataloader: evaluation batches of (features, labels)
    @param model: network to evaluate (left in eval mode on return)
    @param epoch: step index for the TensorBoard scalar
    @param name: tag prefix for the TensorBoard scalar
    @return: (average loss per sample, accuracy in [0, 1])
    """
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            y.long()  # NOTE(review): no-op — Tensor.long() is not in-place
            X, y = X.to(device), y.to(device)
            pred = model(X)
            # print(pred)
            #
            print(X.size())
            # Move predictions/labels to CPU for the scikit-learn forest.
            pred = pred.to('cpu')
            y = y.to('cpu')
            rfc_inputs = X.to('cpu').numpy()
            rfc_pred = rfc.predict(rfc_inputs)
            rfc_pred_prob = rfc.predict_proba(rfc_inputs)
            pred_list = []
            for idx, p in enumerate(rfc_pred):
                mlp_p = pred.argmax(1)[idx]
                label_p = y[idx]
                if p == mlp_p:
                    # Both classifiers agree: take the shared prediction.
                    pred_list.append(p)
                else:
                    # Disagreement: keep whichever classifier's winning score
                    # sits further above its own mean.  MLP outputs are
                    # sum-normalised (not softmaxed) as a rough probability
                    # proxy — TODO(review): confirm this is intended.
                    # mlp_probs = (pred[idx].numpy() - np.mean(pred[idx].numpy())) / np.std(pred[idx].numpy())
                    # mlp_probs = mlp_probs / mlp_probs.sum()
                    mlp_probs = pred[idx].numpy() / (pred[idx].numpy().sum() if pred[idx].numpy().sum()!=0 else 1)
                    rfc_probs = np.array(rfc_pred_prob[idx])
                    std_pro = 0
                    tmp = rfc_probs[p] / mlp_probs[p]
                    select = random.randint(0, 1)
                    # if select == 0:
                    if (rfc_probs[p] - np.mean(rfc_probs)) > (mlp_probs[mlp_p] - np.mean(mlp_probs)):
                        pred_list.append(p)
                    else:
                        pred_list.append(mlp_p)
                    # print(pred[idx])
                    print(f"mlp_prob: {mlp_probs[mlp_p]}:{mlp_probs[p]}, rfc_prob: {rfc_probs[p]} label: {label_p}, mlp: {mlp_p} ({pred[idx][mlp_p]}) {'对' if mlp_p == label_p else '错'}, rfc: {p} {'对' if p == label_p else '错'} {pred_list[-1]} {pred_list[-1] == label_p}")
            # print(rfc_pred)
            # for index,p in enumerate(rfc_pred):
            #     pred[index][p] += np.mean(pred[index].numpy())*3.5
            pred = pred.to(device)
            y = y.to(device)
            test_loss += loss_fn(pred, y).item()
            # preds = torch.tensor(pred_list).to(device)
            preds = pred.argmax(1).to(device)
            correct += (preds == y).type(torch.float).sum().item()
    # NOTE(review): loss is averaged over the sample count, not the batch
    # count, so its scale depends on the batch size.
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    writer.add_scalar(f'{name} acc', correct, epoch)
    return test_loss, correct
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(f"./tb_log/mlp_5layer")  # creates the log directory automatically
if __name__ == '__main__':
    # Prefer GPU when available; `device` is also read by train()/test().
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))
    args = get_args()
    batch_size = args.batchSize  # batch size
    learning_rate = args.learningRate  # learning rate
    num_epochs = args.numEpochs  # number of passes over the training set
    data_dir = args.dataDir
    save_dir = args.saveDir
    n_class = args.nClass
    split_data = args.splitData
    # # Load the data
    # # data = readNpy('./feature_extraction/undefence_features.npy')
    traindata = readNpy('./feature_extraction/defence_90.npy')
    testdata = readNpy('./feature_extraction/defence_10.npy')
    # traffic_data = EncryptTrafficDataset(data)
    # print(traffic_data[0])
    # # Split the data
    # train_size = int(split_data * len(traffic_data))
    # test_size = len(traffic_data) - train_size
    # train_data, test_data = torch.utils.data.random_split(traffic_data, [train_size, test_size])
    train_data, test_data = EncryptTrafficDataset(traindata), EncryptTrafficDataset(testdata)
    # Build the dataloaders
    train_dataloader = DataLoader(train_data, batch_size= batch_size, shuffle=True)
    train_features, train_labels = next(iter(train_dataloader))
    print(f"Feature batch shape: {train_features.size()}")
    print(f"Labels batch shape: {train_labels.size()}")
    test_dataloader = DataLoader(test_data, batch_size= batch_size, shuffle=True)
    model = ThreeLinearNetwork().to(device)
    model_path = "model_5layer_67fea_tensorborad.pth"
    # model = torch.load(model_path).to(device)
    print(model)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)
    max_acc = 0
    for t in range(num_epochs):
        print(f"Epoch {t + 1}\n-------------------------------")
        train_loss = train(train_dataloader, model, loss_fn, optimizer)
        train_loss, train_test = test(train_dataloader, model, t+1, 'train')
        test_loss, test_acc = test(test_dataloader, model, t+1, 'test')
        # mylog.state_dict_update([('train_loss_list', train_loss),
        #                          ('train_acc_list', test_acc),
        #                          ('valid_loss_list', test_loss),
        #                          ('valid_acc_list', test_acc),
        #                          ])
        if t % 100 == 99:
            # Periodic checkpoint every 100 epochs.
            torch.save(model, model_path)
        print(f"当前最大:{max_acc}, test_acc:{test_acc}")
        if test_acc >= max_acc and test_acc >= 0.5:
            # Keep the best model so far, but only once accuracy passes 50%.
            print(test_acc)
            max_acc = test_acc
            torch.save(model, f"model_acc{int(test_acc*100)}_5layer_67fea_tensorboard.pth")
    # model = torch.load(model_path)
    # test_loss, test_acc = test(test_dataloader, model)
    print("Done!")
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/bitosky/encrypted-traffic-analysis-2021.git
git@gitee.com:bitosky/encrypted-traffic-analysis-2021.git
bitosky
encrypted-traffic-analysis-2021
encrypted-traffic-analysis-2021
master

搜索帮助