Field-aware Factorization Machine

# filesystem scheme URI
# local filesystem : "local", "file:///" (either can be used on Linux and macOS, but on a Windows local filesystem only "local" can be used)
# hdfs filesystem : "hdfs://host"
# other filesystem URI
fs_scheme : "file:///",

# if you want to see more detailed logs, set verbose to true
verbose : false,

data {
    # train data
    train {
        # train data path
        # local filesystem : supports single files and recursive directories
        # hdfs filesystem : depends on the spark or hadoop cluster; spark supports regex paths
        data_path : "???", 
  
        # max tolerable count of badly formatted lines in the train data
        max_error_tol : 0
    },

    # test/validation data
    test {
        # test data path (if you have no test data, you don't have to supply it)
        # local filesystem : supports single files and recursive directories
        # hdfs filesystem : depends on the spark or hadoop cluster; spark supports more complicated paths (more than one path, regex paths)
        data_path : "",
      
        # max tolerable count of badly formatted lines in the test data
        max_error_tol : 0
    },

    # delimiters, see data_format.md for more details
    # train/test data format:
    #   regression : weight###label###f1name:f1value,f2name:f2value,...
    #   binary classification : weight###label(0 or 1)###(field1@)f1name:f1value,(field2@)f2name:f2value,...
    #   binary cross_entropy : weight###label(0 ~ 1, positive)###(field1@)f1name:f1value,(field2@)f2name:f2value,...
    # each feature of the ffm model must belong to a field. "(fieldx@)" means that the "fieldx@" prefix is optional; if a feature has no "fieldx@" prefix, its feature name is used as its field
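    # illustrative example lines (assuming the default delimiters configured below; feature and field names are made up):
    #   regression : 1.0###3.5###height:1.72,weight:66.0
    #   binary classification : 1.0###1###user@u_age:25,item@i_price:9.9,i_clicks:3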
    delim {
        # separates sample weight, labels, features
        x_delim : "###",
      
        # if you have more than one label, they will be separated by y_delim
        y_delim : ",",
      
        # separates features (a feature consists of feature_name and feature_value)
        features_delim : ",",
      
        # separates feature_name and feature_value
        feature_name_val_delim : ":",
      
        # separates field and feature_name
        field_delim : "@" 
    },

    # if your task is classification (including multi-class classification),
    # you can downsample/upsample particular classes with a given probability/weight
    # format : y_sampling : ["class1@prob", "class2@prob", ...]
    # downsampling: in binary classification, if you want to keep negative samples with probability 0.1, set y_sampling to ["0@0.1"]
    # upsampling: in multi-class classification, if you want to increase the 6th class's sample weight 10x and the 8th class's 5x, set y_sampling to ["6@10","8@5"]
    # empty means no sampling
    y_sampling : [],

    # whether your train/test data is assigned. See "Train/Test Data Splitting Manner" in running_guide.md for more details on train/test data assignment method
    assigned : false,
    # if your train/test data is not assigned, we provide the following two ways for slaves to read files: 
    # lines_avg : different slaves read different lines of the same file in an alternating manner. if you have only a few train/test files and more than one slave, we recommend this mode
    # files_avg : different slaves read different files. if your files outnumber the slaves and each file contains a similar number of samples, we recommend this mode
    unassigned_mode : "lines_avg" // "files_avg"
},

feature {
    # feature hash, implements "Feature Hashing for Large Scale Multitask Learning"
    # if your feature dimension is very large, training will be very slow; you can use feature hashing to reduce the dimension at a small cost
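    # illustrative example (assumed naming): with the settings below, a raw feature such as "user_12345" is hashed into
    # one of bucket_size buckets and renamed with feature_prefix, e.g. to something like "hash_73251"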
    feature_hash {
        # switch
        need_feature_hash : false,
  
        # hashed feature dimension; the final feature dimension can be larger than bucket_size
        bucket_size : 1000000,
  
        # random seed
        seed : 39916801,
  
        # hashed feature name prefix
        feature_prefix : "hash_"
    },

    # preprocessing feature value
    # many learning algorithms (e.g. l1, l2 regularization) assume that features are centered around zero and have variance of the same order
    # if you use feature transform, a file recording the statistical information is written to "model.data_path" + "_feature_transform_stat" and
    # will be used in offline or online prediction.
    transform {
        # switch for feature value preprocessing
        switch_on : false,
        
        # preprocessing manner:
        #   standardization : x --> (x - mean) / stdvar
        #   scale_range : x --> min + (max - min) * (x - xmin) / (xmax - xmin)
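        # illustrative worked example (made-up numbers): with standardization, a feature with mean 5.0 and stdvar 2.0
        # maps x = 7.0 to (7.0 - 5.0) / 2.0 = 1.0; with scale_range min -1 and max 1, a feature with xmin 0 and xmax 10
        # maps x = 2.5 to -1 + 2 * (2.5 - 0) / (10 - 0) = -0.5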
        mode : "standardization",
      
        # if your preprocessing manner is "scale_range", you must supply min and max scale ranges
        scale_range {
            min : -1,
            max : 1
        },
	    
        # if include_features and exclude_features are both empty lists, all features are transformed.
        # to transform only a subset of features, put them in include_features and set exclude_features to an empty list.
        # to transform all features except some, put those in exclude_features and set include_features to an empty list.
        # if both lists are non-empty, include_features takes effect and exclude_features is ignored.
        # preprocessing only a subset of features, e.g. ["f1", "f3"]
        include_features : [],

        # preprocessing all features except exclude_features, e.g. ["f2", "f3"]
        exclude_features : []
    }

	# if a feature's frequency of occurrence is less than filter_threshold, the feature will be filtered
    filter_threshold : 0
},

model {
    # path(checkpoint path) for model parameters, format:
    # f1,weight,precision(laplace approximate)
    # f2,weight,precision
    # ...
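    # an illustrative checkpoint line with made-up values: ctr,0.0312,15.7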
    # 
    # model dict data will be saved at the path "data_path" + "_dict". this dict can be used as another model's "dict_path", e.g. you can use the model dict of a linear model with l1 as an fm model's "dict_path"
    data_path : "???",
  
    # delimiter in model data
    delim : ",",
  
    # whether to use a user-provided dict or another model's dict
    need_dict : false,
    
    # user provided dict path, if "need_dict:false", dict_path does not need to be provided.
    # dict data format:
    # f1
    # f2
    # ....
    # 
    # attention: dict_path is input, model dict data is output
    dict_path : "",
  
    # model save frequency
    dump_freq : 50,
  
    # the user must provide the field dict path; field dict format (one field per line):
    # field1
    # field2
    # ...
    # fieldn
    field_dict_path : "",
  
    # whether your model uses intercept(bias)
    need_bias : true,
  
    # bias feature name
    bias_feature_name : "_bias_",
  
    # whether to continue training from the checkpoint
    continue_train : false
},

loss {
    # loss function of the model.
    # e.g. "sigmoid" stands for Logistic Regression (the active function is sigmoid and the loss function is negative log likelihood). "l2" stands for Linear Regression. See "Objective Function(active function and loss function)" in models.md for more details
    loss_function : "sigmoid",
  
    # besides the loss, if you want to evaluate other metrics, such as "auc", "confusion_matrix", "RMSE", ..., see evaluation_metrics.md for more details
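    # e.g. to compute both of the metrics mentioned above: evaluate_metric : ["auc", "confusion_matrix"]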
    evaluate_metric : ["auc"],
  
    # whether you only want to evaluate metrics on an already trained model (no further training)
    just_evaluate : false,
  
    # regularization of model parameters to avoid overfitting;
    # l1 and l2 can be used at the same time (elastic net)
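    # e.g. to use l2 regularization only, set l1 : [0.0, 0.0] and keep l2 non-zero;
    # setting both l1 and l2 to non-zero values gives an elastic net penalty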
    regularization : {
        # l1 regularization (a Laplace prior on the parameters),
        # the two terms represent the l1 regularization of the linear part and the second order interaction respectively.
        l1 : [5.28e-9, 5.28e-9],        
        # l2 regularization (a Gaussian prior on the parameters),
        # the two terms represent the l2 regularization of the linear part and the second order interaction respectively.
        l2 : [5.28e-7, 5.28e-7]
    }
},

optimization {
    # optimizer type; this version only supports "line_search" ("trust_region" and "sgd" will be supported in the future)
    optimizer : "line_search",

    line_search {
        # step search stopping criterion, three modes:
        # "sufficient_decrease", "wolfe", "strong_wolfe"
        # if you use "wolfe" mode, the loss will always decrease whether or not it is convex
        mode : "wolfe",

        # backtracking is one of the most effective step search methods; the default values usually work well
        backtracking : {
            step_decr : 0.5,
            step_incr : 2.1,
            max_iter : 55,
            min_step : 1e-16,
            max_step : 1e18,
            c1 : 1e-4,
            c2 : 0.9
        }
        
        # L-BFGS is one of the best optimization methods for both convex and non-convex functions
        lbfgs {
            # L-BFGS uses the most recent m pieces of curvature information to approximate the Hessian matrix
            m : 12,
          	
            # convergence control
            convergence : {
                # maximum number of iterations; if you want more precise convergence (with a risk of overfitting), increase max_iter
                max_iter : 100,
          
                # if you want more precise convergence (with a risk of overfitting), decrease eps
                eps : 1e-3
            }
        }
    }
},

hyper {
    # switch for hyperparameter optimization; if you switch it on, be sure to provide test (validation) data
    switch_on : false,
  
    # between two hyper opt steps, whether the previously optimized model parameters are used as the starting point of the next step.
    # if restart is true, each hyper opt step starts from random model parameters; otherwise it starts from the previously optimized model parameters.
    # in general, use restart:false for a convex loss function and restart:true for a non-convex one
    restart : true,
  
    # hyperparameter optimization method:
    #   "hoag" : hyperparameter optimization with approximate gradient(modified)
    #   "grid" : grid search
    mode : "hoag",

    # hyperparameter optimization with approximate gradient.
    # with this method, convex loss functions can always find optimal hyperparameters; for non-convex loss functions this is not guaranteed.
    # attention: the l1 hyperparameter is not supported by the hoag method!
    hoag {
        # initial step length for hyperparameter optimization; 1.0 always works
        init_step : 1.0,
  
        # between two consecutive steps, if the gradient changes sign, the step decreases: step = step * step_decr_factor
        step_decr_factor : 0.7,
        
        # if the difference in test loss between two consecutive steps is less than this limit, hyper opt stops. if you want more optimal hyperparameters, decrease this value.
        test_loss_reduce_limit : 1e-5,
  
        # maximum number of hyper optimization steps; if you want more optimal hyperparameters, increase this value.
        outer_iter : 10,
        
        # l1 regularization value (fixed)
        # the l1 hyperparameter is not supported by the hoag method
        # the two terms represent the l1 regularization of the linear part and the second order interaction respectively.
        l1 : [0.0, 0.0], 
  
        # l2 regularization initial value
        # the two terms represent the l2 regularization of the linear part and the second order interaction respectively.
        l2 : [5.28e-7, 5.28e-7]
    },

    # grid search; total number of optimization steps = l1parts * l2parts
    grid {
        # range and granularity of the l1 hyperparameter search
        # format: [l1left, l1right, l1parts] means the [left, right] interval will be divided into "l1parts" equal parts; if you want to use a fixed l1 value, set it as follows: [fixedvalue, fixedvalue, 1]
        # the first three terms are used for the linear part, the last three for the interaction part
        l1 : [1e-9, 1e-6, 5, 1e-9, 1e-6, 5], 
      
        # range and granularity of the l2 hyperparameter search
        # format: [l2left, l2right, l2parts] means the [left, right] interval will be divided into "l2parts" equal parts; if you want to use a fixed l2 value, set it as follows: [fixedvalue, fixedvalue, 1]
        # the first three terms are used for the linear part, the last three for the interaction part
        l2 : [1e-8, 1e-5, 5, 1e-8, 1e-5, 5] 
    }

},

random {
    # random initialization method: normal or uniform distribution
    mode : "normal",
  
    # random seed
    seed : 111111,
  
    # normal distribution
    normal {
        # expectation
        mean : 0.0,
        # standard deviation
        std : 0.01
    },
	
    # uniform distribution
    uniform {
        # range start
        range_start : -0.01,
        # range end
        range_end : 0.01
    },
},

# if the value of the first term is >= 1, the ffm model includes a linear part; if <= 0, it does not have a linear part
# if the value of the second term is >= 1, the ffm model includes a second order interaction part; if <= 0, it does not (not recommended, since the model degenerates to a linear model)
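# e.g. (illustrative reading) k : [1, 8] keeps the linear part and enables the second order interaction part;
# here 8 is assumed to be the dimension of the field-aware latent factors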
k : [1,8],

# if the ffm model has a second order interaction part, this indicates whether the bias needs a latent factor.
bias_need_latent_factor : false