# train_transformer.py (last modified by esteng, 2021-05-28: "analysis experiments")
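"""
Training and evaluation script for a language-conditioned transformer
(TransformerEncoder / ResidualTransformerEncoder) that reads a blocks-world
state and an instruction, then predicts the previous and next position of the
block to move, either per-pixel, per-patch, or as a softmax over patches.
Optional auxiliary objectives add coordinate regression and per-patch state
reconstruction; validation reports F1, block accuracy, and a teleportation
metric.
"""
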
import json
from jsonargparse import ArgumentParser, ActionConfigFile
import yaml
from typing import List, Dict
import glob
import os
import pathlib
import pdb
import subprocess
import copy
from io import StringIO
from collections import defaultdict
import torch
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from einops import rearrange
import logging
from tqdm import tqdm
from matplotlib import pyplot as plt
import matplotlib
from matplotlib import gridspec
import numpy as np
import torch.autograd.profiler as profiler
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR
from allennlp.training.scheduler import Scheduler
from allennlp.training.learning_rate_schedulers import NoamLR
import pandas as pd
from transformer import TransformerEncoder, ResidualTransformerEncoder, image_to_tiles, tiles_to_image
from metrics import TransformerTeleportationMetric, MSEMetric, AccuracyMetric, F1Metric
from language_embedders import RandomEmbedder, GloveEmbedder, BERTEmbedder
from data import DatasetReader
from train_language_encoder import get_free_gpu, load_data, get_vocab, LanguageTrainer, FlatLanguageTrainer
logger = logging.getLogger(__name__)
class TransformerTrainer(FlatLanguageTrainer):
    def __init__(self,
                 train_data: List,
                 val_data: List,
                 encoder: TransformerEncoder,
                 optimizer: torch.optim.Optimizer,
                 scheduler: Scheduler,
                 num_epochs: int,
                 num_blocks: int,
                 device: torch.device,
                 checkpoint_dir: str,
                 num_models_to_keep: int,
                 generate_after_n: int,
                 resolution: int = 64,
                 patch_size: int = 8,
                 block_size: int = 4,
                 output_type: str = "per-pixel",
                 depth: int = 7,
                 score_type: str = "acc",
                 best_epoch: int = -1,
                 seed: int = 12,
                 zero_weight: float = 0.05,
                 next_weight: float = 1.0,
                 prev_weight: float = 1.0,
                 do_regression: bool = False,
                 do_reconstruction: bool = False,
                 n_epochs_pre_valid: int = 0):
        super(TransformerTrainer, self).__init__(train_data=train_data,
                                                 val_data=val_data,
                                                 encoder=encoder,
                                                 optimizer=optimizer,
                                                 num_epochs=num_epochs,
                                                 num_blocks=num_blocks,
                                                 device=device,
                                                 checkpoint_dir=checkpoint_dir,
                                                 num_models_to_keep=num_models_to_keep,
                                                 generate_after_n=generate_after_n,
                                                 score_type=score_type,
                                                 resolution=resolution,
                                                 depth=depth,
                                                 best_epoch=best_epoch,
                                                 do_regression=do_regression)
        weight = torch.tensor([zero_weight, 1.0 - zero_weight]).to(device)
        total_steps = num_epochs * len(train_data)
        self.n_epochs_pre_valid = n_epochs_pre_valid
        print(f"total steps {total_steps}")
        self.weighted_xent_loss_fxn = torch.nn.CrossEntropyLoss(weight=weight)
        self.xent_loss_fxn = torch.nn.CrossEntropyLoss()
        self.next_loss_weight = next_weight
        self.prev_loss_weight = prev_weight
        self.scheduler = scheduler
        self.patch_size = patch_size
        self.output_type = output_type
        self.next_to_prev_weight = (next_weight, prev_weight)
        self.do_reconstruction = do_reconstruction
        self.teleportation_metric = TransformerTeleportationMetric(block_size=block_size,
                                                                   image_size=resolution,
                                                                   patch_size=patch_size)
        self.f1_metric = F1Metric()
        self.masked_f1_metric = F1Metric(mask=True)
        if self.do_regression:
            self.mse_metric = MSEMetric()
            self.reg_loss_fxn = torch.nn.MSELoss()
        if self.do_reconstruction:
            self.reconstruction_metric = AccuracyMetric()
        self.set_all_seeds(seed)

    def set_all_seeds(self, seed):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
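
    # note on set_all_seeds: torch.manual_seed also seeds the CUDA RNG on all
    # devices in recent PyTorch versions, and cudnn.deterministic forces
    # deterministic cuDNN kernels (cudnn.benchmark is left at its default)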

    def train_and_validate_one_epoch(self, epoch):
        print(f"Training epoch {epoch}...")
        self.encoder.train()
        skipped = 0
        for b, batch_instance in tqdm(enumerate(self.train_data)):
            self.optimizer.zero_grad()
            outputs = self.encoder(batch_instance)
            # skip bad examples
            if outputs is None:
                skipped += 1
                continue
            if self.output_type == "per-pixel":
                loss = self.compute_weighted_loss(batch_instance, outputs, (epoch + 1) * (b + 1))
            elif self.output_type == "per-patch":
                loss = self.compute_patch_loss(batch_instance, outputs, self.next_to_prev_weight)
            elif self.output_type == "patch-softmax":
                loss = self.compute_xent_loss(batch_instance, outputs)
            else:
                raise AssertionError("must have output in ['per-pixel', 'per-patch', 'patch-softmax']")
            loss.backward()
            self.optimizer.step()
            it = (epoch + 1) * (b + 1)
            self.scheduler.step_batch(it)
        print(f"skipped {skipped} examples")
        print(f"Validating epoch {epoch}...")
        total_dict = defaultdict(float)
        total = 0
        if epoch >= self.n_epochs_pre_valid:
            self.encoder.eval()
            for b, dev_batch_instance in tqdm(enumerate(self.val_data)):
                score_dict = self.validate(dev_batch_instance, epoch, b, 0)
                for k, v in score_dict.items():
                    # accumulate only scalar scores; skip non-numeric entries like bin_dict
                    if isinstance(v, (float, int, np.floating, np.integer)):
                        total_dict[k] += score_dict[k]
                total += 1
            for k, v in total_dict.items():
                total_dict[k] = v / total
            print(f"Epoch {epoch}")
            ordered_keys = sorted(list(total_dict.keys()))
            for k in ordered_keys:
                # all accumulated values are floats at this point; print directly
                print(f"\t{k}: {total_dict[k]:.2f}")
            mean_next_acc = total_dict["next_f1"]
            mean_prev_acc = total_dict["prev_f1"]
            # key must match the dict returned by validate()
            mean_block_acc = total_dict["block_acc"]
            mean_tele_score = total_dict["tele_score"]
            if self.score_type == "acc":
                return (mean_next_acc + mean_prev_acc) / 2, mean_block_acc
            elif self.score_type == "block_acc":
                return mean_block_acc, 0.0
            elif self.score_type == "tele_score":
                return mean_tele_score, 0.0
            else:
                raise AssertionError(f"invalid score type {self.score_type}")
        else:
            if self.score_type == "acc" or self.score_type == "block_acc":
                # return s.t. best epoch is latest, but will never be greater than an actual validation
                return 0 + 0.00001 * epoch, 0
            else:
                # return s.t. best epoch is latest, but will never be less than an actual tele score
                return 100 - 0.001 * epoch, 0

    def compute_weighted_loss(self, inputs, outputs, it):
        """
        compute a per-pixel loss over all pixels, with foreground pixels (where the
        true label is 1) upweighted via the class-weighted cross-entropy
        """
        pred_next_image = outputs["next_position"]
        true_next_image = inputs["next_pos_for_pred"]
        bsz, n_blocks, width, height, depth = pred_next_image.shape
        pred_next_image = pred_next_image.squeeze(-1)
        true_next_image = true_next_image.squeeze(-1).squeeze(-1)
        true_next_image = true_next_image.long().to(self.device)
        next_pixel_loss = self.next_loss_weight * self.weighted_xent_loss_fxn(pred_next_image, true_next_image)
        pred_prev_image = outputs["prev_position"]
        true_prev_image = inputs["prev_pos_for_pred"]
        pred_prev_image = pred_prev_image.squeeze(-1)
        true_prev_image = true_prev_image.squeeze(-1).squeeze(-1)
        true_prev_image = true_prev_image.long().to(self.device)
        prev_pixel_loss = self.prev_loss_weight * self.weighted_xent_loss_fxn(pred_prev_image, true_prev_image)
        total_loss = next_pixel_loss + prev_pixel_loss
        print(f"loss {total_loss.item()}")
        return total_loss
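
    # note on the weighting above: weighted_xent_loss_fxn is a standard
    # torch.nn.CrossEntropyLoss with class weights (zero_weight, 1 - zero_weight),
    # so at the default zero_weight = 0.05 a misclassified foreground pixel costs
    # 19x as much as a misclassified background pixel, offsetting the heavy
    # background/foreground class imbalance in the position images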

    def compute_patch_loss(self, inputs, outputs, next_to_prev_weight=(1.0, 1.0)):
        """
        compute a per-patch loss for each patch (see _patch_binarization_sketch
        below for a standalone illustration of the patch labeling rule)
        """
        bsz, __, w, h = inputs['prev_pos_input'].shape
        pred_next_image = outputs["next_position"]
        pred_prev_image = outputs["prev_position"]
        true_next_image = image_to_tiles(inputs["next_pos_for_pred"].reshape(bsz, 1, w, h), self.patch_size)
        true_prev_image = image_to_tiles(inputs["prev_pos_for_pred"].reshape(bsz, 1, w, h), self.patch_size)
        # binarize patches
        prev_sum_image = torch.sum(true_prev_image, dim=2, keepdim=True)
        prev_patches = torch.zeros_like(prev_sum_image)
        next_sum_image = torch.sum(true_next_image, dim=2, keepdim=True)
        next_patches = torch.zeros_like(next_sum_image)
        # any patch that has a 1 pixel in it gets 1
        prev_patches[prev_sum_image != 0] = 1
        next_patches[next_sum_image != 0] = 1
        pred_prev_image = pred_prev_image.squeeze(-1)
        pred_next_image = pred_next_image.squeeze(-1)
        prev_patches = prev_patches.squeeze(-1).to(self.device).long()
        next_patches = next_patches.squeeze(-1).to(self.device).long()
        pred_prev_image = rearrange(pred_prev_image, 'b n c -> b c n')
        pred_next_image = rearrange(pred_next_image, 'b n c -> b c n')
        prev_pixel_loss = self.weighted_xent_loss_fxn(pred_prev_image, prev_patches)
        next_pixel_loss = self.weighted_xent_loss_fxn(pred_next_image, next_patches)
        next_weight = next_to_prev_weight[0]
        prev_weight = next_to_prev_weight[1]
        total_loss = next_weight * next_pixel_loss + prev_weight * prev_pixel_loss
        print(f"loss {total_loss.item()}")
        if self.do_regression:
            pred_pos = outputs["next_pos_xyz"].reshape(-1)
            true_pos = inputs["next_pos_for_regression"].reshape(-1).to(self.device)
            reg_loss = self.reg_loss_fxn(pred_pos, true_pos)
            total_loss += reg_loss
        if self.do_reconstruction:
            # do state reconstruction from image input for previous and next image
            true_next_image_recon = image_to_tiles(inputs["next_pos_for_acc"].reshape(bsz, 1, w, h), self.patch_size)
            true_prev_image_recon = image_to_tiles(inputs["prev_pos_for_acc"].reshape(bsz, 1, w, h), self.patch_size)
            # take max of each patch so that even mixed patches count as having a block
            true_next_image_recon, __ = torch.max(true_next_image_recon, dim=2)
            true_prev_image_recon, __ = torch.max(true_prev_image_recon, dim=2)
            pred_next_image_recon = outputs["next_per_patch_class"]
            pred_prev_image_recon = outputs["prev_per_patch_class"]
            bsz, n = true_next_image_recon.shape
            # 21 output classes: presumably 20 block types plus background (cf. the --channels default of 21)
            pred_next_image_recon = pred_next_image_recon.reshape(bsz * n, 21)
            pred_prev_image_recon = pred_prev_image_recon.reshape(bsz * n, 21)
            true_next_image_recon = true_next_image_recon.reshape(-1).to(pred_next_image_recon.device).long()
            true_prev_image_recon = true_prev_image_recon.reshape(-1).to(pred_next_image_recon.device).long()
            prev_loss = self.xent_loss_fxn(pred_prev_image_recon, true_prev_image_recon)
            next_loss = self.xent_loss_fxn(pred_next_image_recon, true_next_image_recon)
            total_loss += prev_loss + next_loss
        return total_loss

    def compute_xent_loss(self, inputs, outputs):
        """
        instead of bce against each patch, one distribution over all patches
        """
        bsz, __, w, h = inputs['prev_pos_input'].shape
        pred_next_image = outputs["next_position"]
        pred_prev_image = outputs["prev_position"]
        pred_next_image = pred_next_image.reshape((bsz, -1))
        pred_prev_image = pred_prev_image.reshape((bsz, -1))
        true_next_image = image_to_tiles(inputs["next_pos_for_pred"].reshape(bsz, 1, w, h), self.patch_size)
        true_prev_image = image_to_tiles(inputs["prev_pos_for_pred"].reshape(bsz, 1, w, h), self.patch_size)
        # binarize patches
        prev_sum_image = torch.sum(true_prev_image, dim=2, keepdim=True)
        prev_patches = torch.zeros_like(prev_sum_image)
        next_sum_image = torch.sum(true_next_image, dim=2, keepdim=True)
        next_patches = torch.zeros_like(next_sum_image)
        # any patch that has a 1 pixel in it gets 1
        prev_patches[prev_sum_image != 0] = 1
        next_patches[next_sum_image != 0] = 1
        # get single patch index (for now)
        prev_patches_max = torch.argmax(prev_patches, dim=1).reshape(-1)
        next_patches_max = torch.argmax(next_patches, dim=1).reshape(-1)
        prev_patches_max = prev_patches_max.to(pred_prev_image.device)
        next_patches_max = next_patches_max.to(pred_next_image.device)
        prev_loss = self.xent_loss_fxn(pred_prev_image, prev_patches_max)
        next_loss = self.xent_loss_fxn(pred_next_image, next_patches_max)
        total_loss = prev_loss + next_loss
        print(f"loss {total_loss.item()}")
        return total_loss
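
    # note on the argmax above: prev_patches / next_patches are 0/1 vectors over
    # patches, and torch.argmax returns the index of the first maximal value, i.e.
    # the first positive patch; if several patches are positive, only that first
    # one becomes the cross-entropy target (hence "single patch index (for now)")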

    def validate(self, batch_instance, epoch_num, batch_num, instance_num):
        self.encoder.eval()
        outputs = self.encoder(batch_instance)
        prev_position = outputs['prev_position']
        next_position = outputs['next_position']
        if self.output_type == 'per-patch':
            prev_position = tiles_to_image(prev_position, self.patch_size, output_type="per-patch", upsample=True)
            next_position = tiles_to_image(next_position, self.patch_size, output_type="per-patch", upsample=True)
            prev_position = prev_position.unsqueeze(-1)
            next_position = next_position.unsqueeze(-1)
        elif self.output_type == "patch-softmax":
            prev_position = tiles_to_image(prev_position, self.patch_size, output_type="patch-softmax", upsample=True)
            next_position = tiles_to_image(next_position, self.patch_size, output_type="patch-softmax", upsample=True)
            prev_position = prev_position.unsqueeze(-1)
            next_position = next_position.unsqueeze(-1)
        else:
            # per-pixel outputs are already at full resolution
            pass
        prev_p, prev_r, prev_f1 = self.f1_metric.compute_f1(batch_instance["prev_pos_for_pred"], prev_position)
        next_p, next_r, next_f1 = self.f1_metric.compute_f1(batch_instance["next_pos_for_pred"], next_position)
        masked_prev_p, masked_prev_r, masked_prev_f1 = self.masked_f1_metric.compute_f1(batch_instance["prev_pos_for_pred"], prev_position)
        masked_next_p, masked_next_r, masked_next_f1 = self.masked_f1_metric.compute_f1(batch_instance["next_pos_for_pred"], next_position)
        all_tele_scores = []
        all_oracle_tele_scores = []
        all_tele_dicts = []
        block_accs = []
        pred_centers, true_centers = [], []
        bsz = prev_position.shape[0]
        for batch_idx in range(bsz):
            if self.do_regression:
                # NOT AS ACCURATE
                # next_xyz = outputs['next_pos_xyz'].reshape(bsz, 3)[batch_idx]
                next_xyz_batch = None
            else:
                next_xyz_batch = None
            tele_dict = self.teleportation_metric.get_metric(batch_instance["next_pos_for_acc"][batch_idx].clone(),
                                                             batch_instance["prev_pos_for_acc"][batch_idx].clone(),
                                                             prev_position[batch_idx].clone(),
                                                             outputs["next_position"][batch_idx].clone(),
                                                             batch_instance["block_to_move"][batch_idx].clone(),
                                                             next_xyz=next_xyz_batch)
            all_tele_dicts.append(tele_dict)
            all_tele_scores.append(tele_dict['distance'])
            all_oracle_tele_scores.append(tele_dict['oracle_distance'])
            block_accs.append(tele_dict['block_acc'])
            pred_centers.append(tele_dict['pred_center'])
            true_centers.append(tele_dict['true_center'])
        total_tele_score = np.mean(all_tele_scores)
        total_oracle_tele_score = np.mean(all_oracle_tele_scores)
        block_accuracy = np.mean(block_accs)
        bin_dict = defaultdict(list)
        if self.do_regression:
            mse = self.mse_metric(batch_instance['next_pos_for_regression'],
                                  outputs['next_pos_xyz'])
        else:
            mse = 100
        if self.do_reconstruction:
            bsz, w, h, __, __ = batch_instance["next_pos_for_acc"].shape
            true_next_image_recon = image_to_tiles(batch_instance["next_pos_for_acc"].reshape(bsz, 1, w, h), self.patch_size)
            true_prev_image_recon = image_to_tiles(batch_instance["prev_pos_for_acc"].reshape(bsz, 1, w, h), self.patch_size)
            # take max of each patch so that even mixed patches count as having a block
            true_next_image_recon, __ = torch.max(true_next_image_recon, dim=2)
            true_prev_image_recon, __ = torch.max(true_prev_image_recon, dim=2)
            next_recon_metric = self.reconstruction_metric(true_next_image_recon,
                                                           outputs['next_per_patch_class'])
            prev_recon_metric = self.reconstruction_metric(true_prev_image_recon,
                                                           outputs['prev_per_patch_class'])
        else:
            prev_recon_metric = 0.0
            next_recon_metric = 0.0
        if epoch_num > self.generate_after_n:
            for i in range(outputs["next_position"].shape[0]):
                output_path = self.checkpoint_dir.joinpath(f"batch_{batch_num}").joinpath(f"instance_{i}")
                output_path.mkdir(parents=True, exist_ok=True)
                command = batch_instance["command"][i]
                command = [x for x in command if x != "<PAD>"]
                command = " ".join(command)
                next_pos = batch_instance["next_pos_for_acc"][i]
                prev_pos = batch_instance["prev_pos_for_acc"][i]
                if "prev_per_patch_class" in outputs.keys() and outputs["prev_per_patch_class"] is not None:
                    self.generate_reconstruction_image(prev_pos,
                                                       outputs['prev_per_patch_class'][i],
                                                       output_path.joinpath("prev_recon"),
                                                       caption=command)
                    self.generate_reconstruction_image(next_pos,
                                                       outputs['next_per_patch_class'][i],
                                                       output_path.joinpath("next_recon"),
                                                       caption=command)
                self.generate_debugging_image(next_pos,
                                              next_position[i],
                                              output_path.joinpath("next"),
                                              caption=command,
                                              pred_center=pred_centers[i],
                                              true_center=true_centers[i])
                self.generate_debugging_image(prev_pos,
                                              prev_position[i],
                                              output_path.joinpath("prev"),
                                              caption=command)
                bin_distance = int(all_tele_dicts[i]["distance"])
                bin_dict[bin_distance].append(str(output_path))
                try:
                    with open(output_path.joinpath("attn_weights"), "w") as f1:
                        # for now, just take the last layer
                        to_dump = {"command": batch_instance['command'][i],
                                   "prev_weight": outputs['prev_attn_weights'][-1][i],
                                   "next_weight": outputs['next_attn_weights'][-1][i]}
                        json.dump(to_dump, f1)
                except IndexError:
                    # train-time, pass
                    pass
        return {
            "prev_r": prev_r,
            "prev_p": prev_p,
            "prev_f1": prev_f1,
            "next_r": next_r,
            "next_p": next_p,
            "next_f1": next_f1,
            "masked_prev_r": masked_prev_r,
            "masked_prev_p": masked_prev_p,
            "masked_prev_f1": masked_prev_f1,
            "masked_next_r": masked_next_r,
            "masked_next_p": masked_next_p,
            "masked_next_f1": masked_next_f1,
            "block_acc": block_accuracy,
            "mse": mse,
            "prev_recon_acc": prev_recon_metric,
            "next_recon_acc": next_recon_metric,
            "tele_score": total_tele_score,
            "oracle_tele_score": total_oracle_tele_score,
            "bin_dict": bin_dict}

    def compute_localized_accuracy(self, true_pos, pred_pos, waste):
        """
        balanced accuracy: the mean of foreground-pixel and background-pixel
        accuracy (the third argument is unused)
        """
        values, pred_pixels = torch.max(pred_pos, dim=1)
        pred_pixels = pred_pixels.unsqueeze(-1)
        gold_pixels_ones = true_pos[true_pos == 1]
        pred_pixels_ones = pred_pixels[true_pos == 1]
        # flatten
        pred_pixels_ones = pred_pixels_ones.reshape(-1).detach().cpu()
        gold_pixels_ones = gold_pixels_ones.reshape(-1).detach().cpu()
        # compare
        total_foreground = gold_pixels_ones.shape[0]
        matching_foreground = torch.sum(pred_pixels_ones == gold_pixels_ones).item()
        try:
            foreground_acc = matching_foreground / total_foreground
        except ZeroDivisionError:
            foreground_acc = 0.0
        gold_pixels_zeros = true_pos[true_pos == 0]
        pred_pixels_zeros = pred_pixels[true_pos == 0]
        # flatten
        pred_pixels_zeros = pred_pixels_zeros.reshape(-1).detach().cpu()
        gold_pixels_zeros = gold_pixels_zeros.reshape(-1).detach().cpu()
        total_background = gold_pixels_zeros.shape[0]
        matching_background = torch.sum(pred_pixels_zeros == gold_pixels_zeros).item()
        try:
            background_acc = matching_background / total_background
        except ZeroDivisionError:
            background_acc = 0.0
        # print(f"foreground {foreground_acc} background {background_acc}")
        return (foreground_acc + background_acc) / 2

    def generate_reconstruction_image(self,
                                      true_data,
                                      pred_data,
                                      out_path,
                                      is_input=False,
                                      caption=None,
                                      pred_center=None,
                                      true_center=None):
        # upsample predictions
        pred_data = pred_data.unsqueeze(0).unsqueeze(-1)
        pred_data_image = tiles_to_image(pred_data, self.patch_size, output_type="per-patch", upsample=True)
        pred_classes = torch.argmax(pred_data_image, dim=1)
        order = ["adidas", "bmw", "burger king", "coca cola", "esso", "heineken", "hp",
                 "mcdonalds", "mercedes benz", "nvidia", "pepsi", "shell", "sri", "starbucks",
                 "stella artois", "target", "texaco", "toyota", "twitter", "ups"]
        legend = [f"{i+1}: {name}" for i, name in enumerate(order)]
        legend_str = "\n".join(legend)
        caption = self.wrap_caption(caption)
        cmap = plt.get_cmap("tab20b")
        # num_blocks x depth x 64 x 64
        xs = np.arange(0, self.resolution, 1)
        zs = np.arange(0, self.resolution, 1)
        depth = 0
        fig = plt.figure(figsize=(16, 12))
        gs = gridspec.GridSpec(1, 2, width_ratios=[4, 1])
        # add text command for debugging
        text_ax = plt.subplot(gs[1])
        text_ax.axis([0, 1, 0, 1])
        text_ax.text(0.2, 0.02, legend_str, fontsize=12)
        text_ax.axis("off")
        props = dict(boxstyle='round',
                     facecolor='wheat', alpha=0.5)
        text_ax.text(0.05, 0.95, caption, wrap=True, fontsize=14,
                     verticalalignment='top', bbox=props)
        ax = plt.subplot(gs[0])
        ticks = [i for i in range(0, self.resolution + 16, 16)]
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_ylim(0, self.resolution)
        ax.set_xlim(0, self.resolution)
        plt.grid()
        to_plot_xs_lab, to_plot_zs_lab, to_plot_labels = [], [], []
        to_plot_xs_prob, to_plot_zs_prob, to_plot_probs = [], [], []
        for x_pos in xs:
            for z_pos in zs:
                label = true_data[x_pos, z_pos, depth].item()
                # don't plot background
                if label > 0:
                    to_plot_xs_lab.append(x_pos)
                    to_plot_zs_lab.append(z_pos)
                    to_plot_labels.append(int(label))
                prob = pred_classes[0, x_pos, z_pos].item()
                to_plot_xs_prob.append(x_pos)
                to_plot_zs_prob.append(z_pos)
                to_plot_probs.append(prob)
        ax.plot(to_plot_xs_lab, to_plot_zs_lab, ".")
        for x, z, lab in zip(to_plot_xs_lab, to_plot_zs_lab, to_plot_labels):
            ax.annotate(lab, xy=(x, z), fontsize=12)
        # plot centers if available
        if pred_center is not None and true_center is not None:
            plt.plot(*pred_center, marker="D", color="black")
            plt.plot(*true_center, marker="X", color="black")
        # plot as grid squares at all positions
        squares = []
        for x, z, lab in zip(to_plot_xs_prob, to_plot_zs_prob, to_plot_probs):
            rgba = list(cmap(lab))
            # make semi-transparent
            rgba[-1] = 0.4
            sq = matplotlib.patches.Rectangle((x, z), width=1, height=1, color=rgba)
            ax.add_patch(sq)
        file_path = f"{out_path}.png"
        # data_path = f"{out_path}.npy"
        # np.save(data_path, true_data)
        print(f"saving to {file_path}")
        plt.savefig(file_path)
        plt.close()
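

def _patch_binarization_sketch():
    """Minimal standalone sketch (not called anywhere in training) of the patch
    binarization rule used in TransformerTrainer.compute_patch_loss: a patch is
    labeled 1 if any pixel inside it is nonzero. It uses plain
    torch.nn.functional.unfold rather than image_to_tiles, so the shapes are
    illustrative assumptions, not the exact tensors seen during training.
    """
    bsz, w, h, patch_size = 2, 64, 64, 8
    image = torch.zeros((bsz, 1, w, h))
    image[0, 0, 10:14, 20:24] = 1.0  # a toy foreground block
    # unfold -> (bsz, patch_size*patch_size, n_patches); transpose to (bsz, n_patches, patch_size*patch_size)
    tiles = F.unfold(image, kernel_size=patch_size, stride=patch_size).transpose(1, 2)
    # a patch containing any foreground pixel gets label 1, else 0
    patch_labels = (tiles.sum(dim=2, keepdim=True) != 0).long()
    return patch_labels  # (bsz, n_patches, 1)

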
def main(args):
    if args.binarize_blocks:
        args.num_blocks = 1
    device = "cpu"
    if args.cuda is not None:
        free_gpu_id = get_free_gpu()
        if free_gpu_id > -1:
            device = f"cuda:{free_gpu_id}"
    device = torch.device(device)
    print(f"On device {device}")
    test = torch.ones((1))
    test = test.to(device)
    # load the data
    dataset_reader = DatasetReader(args.train_path,
                                   args.val_path,
                                   args.test_path,
                                   image_path=args.image_path,
                                   include_depth=args.include_depth,
                                   batch_by_line=args.traj_type != "flat",
                                   traj_type=args.traj_type,
                                   batch_size=args.batch_size,
                                   max_seq_length=args.max_seq_length,
                                   do_filter=args.do_filter,
                                   do_one_hot=args.do_one_hot,
                                   top_only=args.top_only,
                                   resolution=args.resolution,
                                   is_bert="bert" in args.embedder,
                                   binarize_blocks=args.binarize_blocks,
                                   augment_with_noise=args.augment_with_noise)
    checkpoint_dir = pathlib.Path(args.checkpoint_dir)
    if not args.test:
        print(f"Reading data from {args.train_path}")
        train_vocab = dataset_reader.read_data("train")
        try:
            os.mkdir(checkpoint_dir)
        except FileExistsError:
            pass
        with open(checkpoint_dir.joinpath("vocab.json"), "w") as f1:
            json.dump(list(train_vocab), f1)
    else:
        print(f"Reading vocab from {checkpoint_dir}")
        with open(checkpoint_dir.joinpath("vocab.json")) as f1:
            train_vocab = json.load(f1)
    # don't read if doing test
    if args.test_path is None:
        print(f"Reading data from {args.val_path}")
        dev_vocab = dataset_reader.read_data("dev")
    if args.test_path is not None:
        test_vocab = dataset_reader.read_data("test")
    else:
        # no test then delete
        del dataset_reader.data['test']
    print("got data")
    print(f"train/dev: {len(dataset_reader.data['train'])}/{len(dataset_reader.data['dev'])}")
    # construct the vocab and tokenizer
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    print("constructing model...")
    # get the embedder from args
    if args.embedder == "random":
        embedder = RandomEmbedder(tokenizer, train_vocab, args.embedding_dim, trainable=True)
    elif args.embedder == "glove":
        embedder = GloveEmbedder(tokenizer, train_vocab, args.embedding_file, args.embedding_dim, trainable=True)
    elif args.embedder.startswith("bert"):
        embedder = BERTEmbedder(model_name=args.embedder, max_seq_len=args.max_seq_length)
    else:
        raise NotImplementedError(f"No embedder {args.embedder}")
    if args.top_only:
        depth = 1
    else:
        # TODO (elias): confirm this number
        depth = 7
    encoder_cls = ResidualTransformerEncoder if args.encoder_type == "ResidualTransformerEncoder" else TransformerEncoder
    encoder_kwargs = dict(image_size=args.resolution,
                          patch_size=args.patch_size,
                          language_embedder=embedder,
                          n_layers_shared=args.n_shared_layers,
                          n_layers_split=args.n_split_layers,
                          n_classes=2,
                          channels=args.channels,
                          n_heads=args.n_heads,
                          hidden_dim=args.hidden_dim,
                          ff_dim=args.ff_dim,
                          dropout=args.dropout,
                          embed_dropout=args.embed_dropout,
                          output_type=args.output_type,
                          positional_encoding_type=args.pos_encoding_type,
                          device=device,
                          log_weights=args.test,
                          init_scale=args.init_scale,
                          do_regression=False,
                          do_reconstruction=args.do_reconstruction,
                          pretrained_weights=args.pretrained_weights)
    if args.encoder_type == "ResidualTransformerEncoder":
        encoder_kwargs["do_residual"] = args.do_residual
    # initialize encoder
    encoder = encoder_cls(**encoder_kwargs)
    if args.cuda is not None:
        encoder = encoder.cuda(device)
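    # note: n_classes = 2 above is the binary foreground/background prediction
    # over pixels or patches; the 21-way per-patch reconstruction used with
    # --do-reconstruction (compare the reshape(..., 21) in compute_patch_loss)
    # is presumably a separate head configured inside the encoder itself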
    print(encoder)
    # construct optimizer
    optimizer = torch.optim.Adam(encoder.parameters(), lr=args.learn_rate)
    # scheduler
    scheduler = NoamLR(optimizer, model_size=args.hidden_dim, warmup_steps=args.warmup, factor=args.lr_factor)
    best_epoch = -1
    # block size in pixels scales linearly with resolution (4 px at the default 64x64)
    block_size = int((args.resolution * 4) / 64)
    if not args.test:
        if not args.resume:
            try:
                os.mkdir(args.checkpoint_dir)
            except FileExistsError:
                # file exists
                try:
                    assert len(glob.glob(os.path.join(args.checkpoint_dir, "*.th"))) == 0
                except AssertionError:
                    raise AssertionError(f"Output directory {args.checkpoint_dir} non-empty, will not overwrite!")
        else:
            # resume from pre-trained
            encoder = encoder.to("cpu")
            state_dict = torch.load(pathlib.Path(args.checkpoint_dir).joinpath("best.th"), map_location='cpu')
            encoder.load_state_dict(state_dict, strict=True)
            encoder = encoder.cuda(device)
            # get training info
            best_checkpoint_data = json.load(open(pathlib.Path(args.checkpoint_dir).joinpath("best_training_state.json")))
            print(f"best_checkpoint_data {best_checkpoint_data}")
            best_epoch = best_checkpoint_data["epoch"]
        # save arg config to checkpoint_dir
        with open(pathlib.Path(args.checkpoint_dir).joinpath("config.yaml"), "w") as f1:
            dump_args = copy.deepcopy(args)
            # drop stuff we can't serialize
            del dump_args.__dict__["cfg"]
            del dump_args.__dict__["__cwd__"]
            del dump_args.__dict__["__path__"]
            to_dump = dump_args.__dict__
            # dump
            yaml.safe_dump(to_dump, f1, encoding='utf-8', allow_unicode=True)
        # construct trainer
        trainer = TransformerTrainer(train_data=dataset_reader.data["train"],
                                     val_data=dataset_reader.data["dev"],
                                     encoder=encoder,
                                     optimizer=optimizer,
                                     scheduler=scheduler,
                                     num_epochs=args.num_epochs,
                                     num_blocks=args.num_blocks,
                                     device=device,
                                     checkpoint_dir=args.checkpoint_dir,
                                     num_models_to_keep=args.num_models_to_keep,
                                     generate_after_n=args.generate_after_n,
                                     score_type=args.score_type,
                                     depth=depth,
                                     resolution=args.resolution,
                                     output_type=args.output_type,
                                     patch_size=args.patch_size,
                                     block_size=block_size,
                                     best_epoch=best_epoch,
                                     seed=args.seed,
                                     zero_weight=args.zero_weight,
                                     next_weight=args.next_weight,
                                     prev_weight=args.prev_weight,
                                     do_regression=args.do_regression,
                                     do_reconstruction=args.do_reconstruction,
                                     n_epochs_pre_valid=args.n_epochs_pre_valid)
        trainer.train()
    else:
        # test-time, load best model
        print(f"loading model weights from {args.checkpoint_dir}")
        encoder = encoder.to("cpu")
        state_dict = torch.load(pathlib.Path(args.checkpoint_dir).joinpath("best.th"), map_location='cpu')
        encoder.load_state_dict(state_dict, strict=True)
        encoder = encoder.cuda(device)
        if "test" in dataset_reader.data.keys():
            eval_data = dataset_reader.data['test']
            if args.out_path is None:
                out_path = "test_metrics.json"
            else:
                out_path = args.out_path
        else:
            eval_data = dataset_reader.data['dev']
            if args.out_path is None:
                out_path = "val_metrics.json"
            else:
                out_path = args.out_path
        eval_trainer = TransformerTrainer(train_data=dataset_reader.data["train"],
                                          val_data=eval_data,
                                          encoder=encoder,
                                          optimizer=None,
                                          scheduler=None,
                                          num_epochs=0,
                                          num_blocks=args.num_blocks,
                                          device=device,
                                          resolution=args.resolution,
                                          output_type=args.output_type,
                                          checkpoint_dir=args.checkpoint_dir,
                                          patch_size=args.patch_size,
                                          block_size=block_size,
                                          num_models_to_keep=0,
                                          seed=args.seed,
                                          generate_after_n=args.generate_after_n,
                                          score_type=args.score_type,
                                          do_regression=args.do_regression,
                                          do_reconstruction=args.do_reconstruction)
        print("evaluating")
        eval_trainer.evaluate(out_path)
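

def _noam_lr_sketch(step, model_size=512, warmup=4000, factor=1.0):
    """Hedged sketch (unused by the trainer) of the Noam schedule that
    allennlp's NoamLR applies in main(): linear warmup for `warmup` steps,
    then decay proportional to the inverse square root of the step number.
    Defaults mirror the NoamLR call above; for intuition only, and step must
    be >= 1 to avoid division by zero.
    """
    return factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

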
if __name__ == "__main__":
    np.random.seed(12)
    torch.manual_seed(12)
    parser = ArgumentParser()
    # config file
    parser.add_argument("--cfg", action=ActionConfigFile)
    # training
    parser.add_argument("--test", action="store_true", help="load model and test")
    parser.add_argument("--resume", action="store_true", help="resume training a model")
    # data
    parser.add_argument("--train-path", type=str, default="blocks_data/trainset_v2.json", help="path to train data")
    parser.add_argument("--val-path", type=str, default="blocks_data/devset.json", help="path to dev data")
    parser.add_argument("--test-path", default=None, help="path to test data")
    parser.add_argument("--image-path", default=None, help="path to simulation-generated heightmap images of scenes")
    # NB: default=True combined with store_true means this flag is effectively always on
    parser.add_argument("--include-depth", default=True, action="store_true", help="include depth heightmap with images when training from images of state")
    parser.add_argument("--num-blocks", type=int, default=20)
    parser.add_argument("--binarize-blocks", action="store_true", help="flag to treat block prediction as binary task instead of num-blocks-way classification")
    parser.add_argument("--traj-type", type=str, default="flat", choices=["flat", "trajectory"])
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--max-seq-length", type=int, default=65)
    parser.add_argument("--do-filter", action="store_true", help="set if we want to restrict prediction to the block moved")
    parser.add_argument("--do-one-hot", action="store_true", help="set if you want input representation to be one-hot")
    parser.add_argument("--channels", type=int, default=21)
    parser.add_argument("--top-only", action="store_true", help="set if we want to train/predict only the top-most slice of the top-down view")
    parser.add_argument("--resolution", type=int, help="resolution to discretize input state", default=64)
    parser.add_argument("--next-weight", type=float, default=1)
    parser.add_argument("--prev-weight", type=float, default=1)
    # store_true actions may not also take a type, so the original type=bool is dropped here
    parser.add_argument("--augment-with-noise", action="store_true", default=False, help="set to augment training images with gaussian noise")
    # language embedder
    parser.add_argument("--embedder", type=str, default="random", choices=["random", "glove", "bert-base-cased", "bert-base-uncased"])
    parser.add_argument("--embedding-file", type=str, help="path to pretrained glove embeddings")
    parser.add_argument("--embedding-dim", type=int, default=300)
    # transformer parameters
    parser.add_argument("--encoder-type", type=str, default="TransformerEncoder", choices=["TransformerEncoder", "ResidualTransformerEncoder"], help="choice of dual-stream transformer encoder or one that bases next prediction on previous transformer representation")
    parser.add_argument("--pos-encoding-type", type=str, default="learned")
    parser.add_argument("--patch-size", type=int, default=8)
    parser.add_argument("--n-shared-layers", type=int, default=6)
    parser.add_argument("--n-split-layers", type=int, default=2)
    parser.add_argument("--n-classes", type=int, default=2)
    parser.add_argument("--n-heads", type=int, default=8)
    parser.add_argument("--hidden-dim", type=int, default=512)
    parser.add_argument("--ff-dim", type=int, default=1024)
    parser.add_argument("--dropout", type=float, default=0.2)
    parser.add_argument("--embed-dropout", type=float, default=0.2)
    parser.add_argument("--output-type", type=str, choices=["per-pixel", "per-patch", "patch-softmax"], default="per-pixel")
    parser.add_argument("--do-residual", action="store_true", help="set to residually connect unshared and next prediction in ResidualTransformerEncoder")
    parser.add_argument("--pretrained-weights", type=str, default=None, help="path to best.th file for a pre-trained initialization")
    # misc
    parser.add_argument("--cuda", type=int, default=None)
    parser.add_argument("--learn-rate", type=float, default=3e-5)
    parser.add_argument("--warmup", type=int, default=4000, help="warmup steps for learn-rate scheduling")
    parser.add_argument("--n-epochs-pre-valid", type=int, default=0, help="number of epochs to run before doing validation")
    parser.add_argument("--lr-factor", type=float, default=1.0, help="factor for learn-rate scheduling")
    parser.add_argument("--gamma", type=float, default=0.7)
    parser.add_argument("--checkpoint-dir", type=str, default="models/language_pretrain")
    parser.add_argument("--num-models-to-keep", type=int, default=5)
    parser.add_argument("--num-epochs", type=int, default=3)
    parser.add_argument("--generate-after-n", type=int, default=10)
    parser.add_argument("--score-type", type=str, default="acc", choices=["acc", "block_acc", "tele_score"])
    parser.add_argument("--zero-weight", type=float, default=0.05, help="weight for loss weighting negative vs positive examples")
    parser.add_argument("--init-scale", type=int, default=4, help="initialization scale for transformer weights")
    parser.add_argument("--seed", type=int, default=12)
    parser.add_argument("--do-regression", action="store_true", help="add a regression task to learning")
    parser.add_argument("--do-reconstruction", action="store_true", help="add a reconstruction task to learning")
    parser.add_argument("--out-path", type=str, default=None, help="when decoding, path to output file")
    args = parser.parse_args()
    if args.do_one_hot:
        args.channels = 21
    main(args)