Visualization #1

Open · wants to merge 63 commits into base: master
Commits (63)
f906bc5
first attempt at group lasso
Oct 12, 2018
5a42380
experimenting with stuff
dodgejesse Oct 12, 2018
22b739e
added random hyperparameter search, and logging.
dodgejesse Oct 17, 2018
66ed540
removed old running script
dodgejesse Oct 17, 2018
ecc878f
manually edited files to match explore brach of private repo
dodgejesse Oct 17, 2018
9d6d3e6
added n-gram cpu computation
dodgejesse Oct 19, 2018
ae50123
added cuda implementation of fourgram forward and backward computation
dodgejesse Oct 23, 2018
53d1336
fourgram model works, can use two types of regularization.
dodgejesse Oct 26, 2018
14aee6e
implemented 1,2,3,4 gram models in cuda.
dodgejesse Oct 26, 2018
e9b8b9e
can now train models which can capture 1,2,3,4 grams, aka learned str…
dodgejesse Oct 27, 2018
5e897bf
small changes
dodgejesse Nov 8, 2018
8583d59
runs on amazon_categories. implemented entropy regularization. can lo…
dodgejesse Nov 24, 2018
ac08ea4
adding more hyperparameter optimization
dodgejesse Dec 4, 2018
a948e8e
currently stuck on training small learning rate, gradient clipping, r…
dodgejesse Dec 6, 2018
dea3c88
now searches over regularization strength, and resamples if the learn…
dodgejesse Dec 8, 2018
6e19681
small changes
dodgejesse Dec 8, 2018
fa8b8eb
small changes
dodgejesse Dec 13, 2018
da4a077
added ability to load learend group norms (for l1 regularization), an…
dodgejesse Dec 19, 2018
9fd0705
small changes
dodgejesse Dec 21, 2018
22e9719
consolidated regularization search into one script.
dodgejesse Dec 21, 2018
297e7a2
added proximal gradient descent, though it doesnt work yet. its curre…
dodgejesse Dec 22, 2018
8bca89c
hacked together proximal gradient descent. seems to work.
dodgejesse Dec 27, 2018
12f0da6
added another check for regularization search. just using regularizat…
dodgejesse Dec 28, 2018
398b3f8
small changes
dodgejesse Jan 3, 2019
052bb60
small changes
dodgejesse Jan 16, 2019
48b65e6
small changes
dodgejesse Jan 17, 2019
7b446d2
combined the code to load learned structures into one module
dodgejesse Jan 17, 2019
de4d2bd
small changes
dodgejesse Jan 17, 2019
c9dd423
small changes
dodgejesse Jan 17, 2019
9a424a5
1. Added local version of experiment (w/o hyperparameter search). 2. …
Jan 20, 2019
1c69029
Added option to read trained model
Jan 21, 2019
da3b5d7
Refactored code to work with input arguments
Jan 21, 2019
346785f
Fixed writing model command, removed running on test during training
Jan 21, 2019
4e9a42a
General tools for RR
Jan 21, 2019
d381441
Fixed writing model command, removed running on test during training
Jan 21, 2019
313fb08
Made a couple of arguments required
Jan 21, 2019
d78065e
Reverted wrong commit
Jan 21, 2019
ca8cbba
Running embedding dropout only at train time
Jan 23, 2019
1f4062c
added simple example experiment with known results to run_current_exp…
dodgejesse Jan 23, 2019
014c9ce
Embedding dropout fix is redundant
Jan 23, 2019
1461373
Default seed value (for reproducible experiments)
Jan 28, 2019
fd8b4e3
Visualization code: keep trace of max score when computing forward in…
Jan 28, 2019
bdb9e39
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Jan 28, 2019
38d2a78
Remove debug prints
Jan 28, 2019
224ae19
Visualization bug fixes
Jan 28, 2019
f11dad3
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Jan 28, 2019
59086b2
Fixed visualization bug that occurs if a specific pattern length has …
Jan 28, 2019
3b9def7
Cleaned code up a bit, fix score calculation code
Jan 28, 2019
2f65408
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Jan 28, 2019
880aed8
forget gate set to 0 when i=0
Jan 28, 2019
4838ef0
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Jan 28, 2019
a510da2
Correctly printing pattern length in summary
Jan 28, 2019
f7b97ac
Added -inf as score of unreachable states in visualization
Jan 28, 2019
ff73f20
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Jan 28, 2019
6a4c68c
Removed a few old commented code
Jan 29, 2019
179b72f
1. Fixed bug that made code only run on last batch. 2. Added best mat…
Jan 31, 2019
8638c82
Parralized visualization, now computes it for all patterns in parallel
Jan 31, 2019
5ac0631
Support for use_last_cs argument in command line
Feb 1, 2019
92cbbe1
Visualization now works in parallel. Also added some documentation
Feb 1, 2019
7e0d4e8
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Feb 1, 2019
811958a
Printing matches with lowest scores as well
Feb 2, 2019
50a25db
Only visualize pattern states that are of high norm
Feb 6, 2019
4832d7b
Merge branch 'visualization' of https://github.com/dodgejesse/rationa…
Feb 6, 2019
29 changes: 21 additions & 8 deletions classification/dataloader.py
@@ -151,13 +151,14 @@ def pad(sequences, sos=None, eos=None, pad_token='<pad>', pad_left=True, reverse
def create_one_batch(x, y, map2id, oov='<oov>', gpu=False,
sos=None, eos=None, bidirectional=False):
oov_id = map2id[oov]
x_fwd = pad(x, sos=sos, eos=eos, pad_left=True)
length = len(x_fwd[0])
batch_size = len(x_fwd)
x_fwd = [ map2id.get(w, oov_id) for seq in x_fwd for w in seq ]
x_padded = pad(x, sos=sos, eos=eos, pad_left=True)
length = len(x_padded[0])
batch_size = len(x_padded)
x_fwd = [ map2id.get(w, oov_id) for seq in x_padded for w in seq ]
x_fwd = torch.LongTensor(x_fwd)
assert x_fwd.size(0) == length*batch_size
x_fwd, y = x_fwd.view(batch_size, length).t().contiguous(), torch.LongTensor(y)

if gpu:
x_fwd, y = x_fwd.cuda(), y.cuda()
if bidirectional:
@@ -170,12 +171,13 @@ def create_one_batch(x, y, map2id, oov='<oov>', gpu=False,
if gpu:
x_bwd = x_bwd.cuda()
return (x_fwd, x_bwd), y
return (x_fwd), y

return (x_fwd), y, x_padded


# shuffle training examples and create mini-batches
def create_batches(x, y, batch_size, map2id, perm=None, sort=False, gpu=False,
sos=None, eos=None, bidirectional=False):
sos=None, eos=None, bidirectional=False, get_text_batches=False):

lst = perm or list(range(len(x)))
# sort sequences based on their length; necessary for SST
@@ -185,28 +187,39 @@ def create_batches(x, y, batch_size, map2id, perm=None, sort=False, gpu=False,
x = [ x[i] for i in lst ]
y = [ y[i] for i in lst ]

txt_batches = None
if get_text_batches:
txt_batches = []

sum_len = 0.0
batches_x = [ ]
batches_y = [ ]


size = batch_size
nbatch = (len(x)-1) // size + 1
for i in range(nbatch):
bx, by = create_one_batch(x[i*size:(i+1)*size], y[i*size:(i+1)*size],
bx, by, padded_x = create_one_batch(x[i*size:(i+1)*size], y[i*size:(i+1)*size],
map2id, gpu=gpu, sos=sos, eos=eos, bidirectional=bidirectional)
sum_len += len(bx[0])
batches_x.append(bx)
batches_y.append(by)

if get_text_batches:
txt_batches.append(padded_x)

if sort:
perm = list(range(nbatch))
random.shuffle(perm)
batches_x = [ batches_x[i] for i in perm ]
batches_y = [ batches_y[i] for i in perm ]
if get_text_batches:
txt_batches = [txt_batches[i] for i in perm]

# sys.stdout.write("{} batches, avg len: {:.1f}\n".format(
# nbatch, sum_len/nbatch
# ))
return batches_x, batches_y
return batches_x, batches_y, txt_batches


def load_embedding_npz(path):
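Below is a minimal usage sketch (not part of the diff) of the extended create_batches API; the toy vocabulary, sentences, and labels are illustrative only.

# Sketch: calling create_batches with the new get_text_batches flag.
# The vocabulary and sentences below are made up for illustration.
import dataloader

map2id = {'<pad>': 0, '<oov>': 1, 'good': 2, 'bad': 3, 'movie': 4}
x = [['good', 'movie'], ['bad', 'movie'], ['good']]
y = [1, 0, 1]

# With get_text_batches=True the third return value holds the padded token
# strings for each batch, which the visualization code can align with scores.
batches_x, batches_y, txt_batches = dataloader.create_batches(
    x, y, batch_size=2, map2id=map2id, sort=True, gpu=False,
    get_text_batches=True)

for bx, by, txt in zip(batches_x, batches_y, txt_batches):
    print(bx.size(), by.size(), txt[0])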
153 changes: 153 additions & 0 deletions classification/experiment_params.py
@@ -0,0 +1,153 @@
# these categories have more than 100 training instances.
def get_categories():
#return ["apparel/", "automotive/", "baby/", "beauty/", "books/", "camera_&_photo/", "cell_phones_&_service/", "computer_&_video_games/", "dvd/", "electronics/", "gourmet_food/", "grocery/", "health_&_personal_care/", "jewelry_&_watches/", "kitchen_&_housewares/", "magazines/", "music/", "outdoor_living/", "software/", "sports_&_outdoors/", "toys_&_games/", "video/"]
#return ["apparel/", "baby/", "beauty/", "books/", "camera_&_photo/", "cell_phones_&_service/", "computer_&_video_games/", "dvd/", "electronics/", "health_&_personal_care/", "kitchen_&_housewares/", "magazines/", "music/", "software/", "sports_&_outdoors/", "toys_&_games/", "video/"]
#return ["camera_&_photo/","apparel/","health_&_personal_care/", "toys_&_games/", "kitchen_&_housewares/", "dvd/","books/", "original_mix/"]

#return ["kitchen_&_housewares/","dvd/", "books/", "original_mix/"]
#return ["dvd/","original_mix/"]
#return ["kitchen_&_housewares/", "books/"]
#return ["kitchen_&_housewares/"]
return ["books/"]




class ExperimentParams:
def __init__(self,
path = None,
embedding = None,
loaded_embedding = None,
seed = 314159,
model = "rrnn",
semiring = "plus_times",
use_layer_norm = False,
use_output_gate = False,
use_rho = True,
rho_sum_to_one = False,
use_last_cs = False,
use_epsilon_steps = False,
pattern = "2-gram",
activation = "none",
trainer = "adam",
fix_embedding = True,
batch_size = 64,
max_epoch=500,
d_out="256",
dropout=0.2,
embed_dropout=0.2,
rnn_dropout=0.2,
depth=1,
lr=0.001,
lr_decay=0,
lr_schedule_decay=0.5,
gpu=True,
eval_ite=50,
patience=30,
lr_patience=10,
weight_decay=1e-6,
clip_grad=5,
reg_strength=0,
reg_strength_multiple_of_loss=0,
reg_goal_params=False,
prox_step=False,
num_epochs_debug=-1,
debug_run = False,
sparsity_type="none",
filename_prefix="",
filename_suffix="",
dataset="amazon/",
learned_structure=False,
logging_dir="/home/jessedd/projects/rational-recurrences/classification/logging/",
base_data_dir="/home/jessedd/data/",
output_dir=None,
input_model=None
):
self.path = path
self.embedding = embedding
self.loaded_embedding = loaded_embedding
self.seed = seed
self.model = model
self.semiring = semiring
self.use_layer_norm = use_layer_norm
self.use_output_gate = use_output_gate
self.use_rho = use_rho
self.rho_sum_to_one = rho_sum_to_one
self.use_last_cs = use_last_cs
self.use_epsilon_steps = use_epsilon_steps
self.pattern = pattern
self.activation = activation
self.trainer = trainer
self.fix_embedding = fix_embedding
self.batch_size = batch_size
self.max_epoch = max_epoch
self.d_out = d_out
self.dropout = dropout
self.embed_dropout = embed_dropout
self.rnn_dropout = rnn_dropout
self.depth = depth
self.lr = lr
self.lr_decay = lr_decay
self.lr_schedule_decay = lr_schedule_decay
self.gpu = gpu
self.eval_ite = eval_ite
self.patience = patience
self.lr_patience = lr_patience
self.weight_decay = weight_decay
self.clip_grad = clip_grad
self.reg_strength = reg_strength
self.reg_strength_multiple_of_loss = reg_strength_multiple_of_loss
self.reg_goal_params = reg_goal_params
self.prox_step = prox_step
self.num_epochs_debug = num_epochs_debug
self.debug_run = debug_run
self.sparsity_type = sparsity_type
self.filename_prefix = filename_prefix
self.filename_suffix = filename_suffix
self.dataset = dataset
self.learned_structure = learned_structure
self.logging_dir = logging_dir
self.base_data_dir = base_data_dir
self.output_dir = output_dir
self.input_model = input_model

self.current_experiment()

# adjusts the default values with the current experiment
def current_experiment(self):
base_data_dir = self.base_data_dir
if self.debug_run:
base_data_dir += "amazon_debug/"
else:
base_data_dir += self.dataset
self.path = base_data_dir
self.embedding = base_data_dir + "embedding"

def filename(self):
if self.sparsity_type == "none" and self.learned_structure:
sparsity_name = self.learned_structure
else:
sparsity_name = self.sparsity_type
if self.debug_run:
self.filename_prefix += "DEBUG_"
name = "{}{}_layers={}_lr={:.3E}_dout={}_drout={:.4f}_rnndout={:.4f}_embdout={:.4f}_wdecay={:.2E}_clip={:.2f}_pattern={}_sparsity={}".format(
self.filename_prefix, self.trainer, self.depth, self.lr, self.d_out, self.dropout, self.rnn_dropout, self.embed_dropout,
self.weight_decay, self.clip_grad, self.pattern, sparsity_name)
if self.reg_strength > 0:
name += "_regstr={:.3E}".format(self.reg_strength)
if self.reg_strength_multiple_of_loss:
name += "_regstrmultofloss={}".format(self.reg_strength_multiple_of_loss)
if self.reg_goal_params:
name += "_goalparams={}".format(self.reg_goal_params)
if self.prox_step:
name += "_prox"
if self.filename_suffix != "":
name += self.filename_suffix
if not self.gpu:
name = name + "_cpu"

return name

def __str__(self):
return str(vars(self))
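
A short usage sketch (assumed, not from the diff) of how ExperimentParams derives data paths and a log filename; the argument values, including the "states" sparsity type, are illustrative.

# Sketch: constructing ExperimentParams and inspecting the derived filename.
# All values below are illustrative; "states" is only an example sparsity_type.
from experiment_params import ExperimentParams

args = ExperimentParams(pattern="1-gram,2-gram,3-gram,4-gram",
                        d_out="0,4,0,2",
                        sparsity_type="states",
                        reg_strength=0.01,
                        dataset="amazon/",
                        gpu=False)

# current_experiment() has already appended the dataset to base_data_dir.
print(args.path)        # /home/jessedd/data/amazon/
print(args.embedding)   # /home/jessedd/data/amazon/embedding
print(args.filename())  # encodes trainer, lr, dropouts, pattern, sparsity, reg strength, and "_cpu"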

38 changes: 38 additions & 0 deletions classification/experiment_tools.py
@@ -0,0 +1,38 @@
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import time
import os



def preload_embed(dir_location):
start = time.time()
import dataloader
embs = dataloader.load_embedding(os.path.join(dir_location,"embedding_filtered"))
print("took {} seconds".format(time.time()-start))
print("preloaded embeddings from amazon dataset.")
print("")
return embs


def general_arg_parser():
""" CLI args related to training and testing models. """
p = ArgumentParser(add_help=False)
p.add_argument("-d", '--base_dir', help="Data directory", type=str, required=True)
p.add_argument("-a", "--dataset", help="Dataset name", type=str, required=True)
p.add_argument("-p", "--pattern", help="Pattern specification", type=str, default="1-gram,2-gram,3-gram,4-gram")
p.add_argument("--d_out", help="Output dimension(?)", type=str, default="0,4,0,2")
p.add_argument("-g", "--gpu", help="Use GPU", action='store_true')
p.add_argument('--depth', help="Depth of network", type=int, default=1)
p.add_argument("-s", "--seed", help="Random seed", type=int, default=1234)
p.add_argument("-b", "--batch_size", help="Batch size", type=int, default=64)
p.add_argument("--use_last_cs", help="Only use last hidden state as output value", action='store_true')

# p.add_argument("--max_doc_len",
# help="Maximum doc length. For longer documents, spans of length max_doc_len will be randomly "
# "selected each iteration (-1 means no restriction)",
# type=int, default=-1)
# p.add_argument("-n", "--num_train_instances", help="Number of training instances", type=int, default=None)
# p.add_argument("-e", "--embedding_file", help="Word embedding file", required=True)

return p
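
A brief sketch (assumed, not shown in the diff) of how the shared parser could be reused by a training or visualization script via argparse parents; the --input_model flag is hypothetical.

# Sketch: extending the shared CLI arguments in a script-specific parser.
# The --input_model flag is hypothetical and not defined by this diff.
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from experiment_tools import general_arg_parser

parser = ArgumentParser(parents=[general_arg_parser()],
                        formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument("--input_model", help="Path to a trained model to visualize", type=str)

args = parser.parse_args(["-d", "/path/to/data", "-a", "amazon/", "--gpu"])
print(args.pattern, args.d_out, args.use_last_cs)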
