Skip to content

Latest commit

 

History

History
269 lines (216 loc) · 11.1 KB

linear.config.md

File metadata and controls

269 lines (216 loc) · 11.1 KB

Linear Model Config:

# filesystem scheme URI
# local filesystem : "local", "file:///" (you can use either of them on linux and os system, but on the windows local filesystem, only "local" can be used)
# hdfs filesystem : "hdfs://host"
# other filesystem URI
fs_scheme : "file:///",

# if you want to see more detailed logs, set verbose to true
verbose : false,

data {
    # train data
    train {
        # train data path
        # local filesystem : supports file and recursive directories
        # hdfs filesystem : depends on spark or hadoop cluster, spark supports regex paths
        data_path : "???", 
  
        # max tolerable error format count in train data
        max_error_tol : 0
    },

    # test/validation data
    test {
        # test data path(if you have no test data, you don't have to supply it)
        # local filesystem : supports local file and recursive directories
        # hdfs filesystem : depends on spark or hadoop cluster, spark supports more complicated paths(more than one path, regex paths)
        data_path : "",
      
        # max tolerable error format count in test data
        max_error_tol : 0
    },

    # delimiters, see data_format.md for more details
    # train/testing data format:
    #   regression : weight###label###f1name:f1value,f2name:f2value,...
    #   binary classification : weight###label(0 or 1)###f1name:f1value,f2name:f2value,...
    #   binary cross_entropy : weight###label(0~1, positive)###f1name:f1value,f2name:f2value,...
    # format of featureName:featureValue:
    # numerical feature: height:175.4
    # categorical feature: gender_male:1.0
    delim {
        # separates sample weight, labels, features, init_prediction(only in tree model)
        x_delim : "###",
      
        # if you have more than one label, they will be separated by y_delim
        y_delim : ",",
      
        # separates features(a feature includes feature_name and feature_value)
        features_delim : ",",
      
        # separates feature_name and feature_value
        feature_name_val_delim : ":"
    },

    # if your task is of classification(including multi-class classification), 
    # you can downsample/upsample some special classes in special probability/weight
    # format : y_sampling : ["class1@prob", "class2@prob", ...]
    # downsampling: binary classification, if you want to keep negative samples with probability 0.1, set y_sampling to ["0@0.1"]
    # upsampling: multi-class classification, if you want to increase the 6th class sample weight by 10X and the 8th class sample weight by 5X, set y_sampling to ["6@10","8@5"]
    # empty means no sampling
    y_sampling : [],

    # whether your train/test data is assigned. See "Train/Test Data Splitting Manner" in running_guide.md for more details on train/test data assignment method
    assigned : false,
    # if your train/test data is not assigned, we provide the following two ways for slaves to read files: 
    # lines_avg : different slaves read different lines of the same file alternately. If you have only a few train/test files and more than one slave, we recommend this manner
    # files_avg : different slaves read different files, if your files outnumber slaves, and the number of samples in each file is similar, we recommend this manner
    unassigned_mode : "lines_avg" // "files_avg"
},

feature {
    # feature hash, implements "Feature Hashing for Large Scale Multitask Learning"
    # if your feature dim is very large, training will proceed very slowly; you can use feature hash to reduce the dim at a fraction of the cost
    feature_hash {
        # switch
        need_feature_hash : false,
  
        # feature hashed dim, final feature dim can be more than bucket_size
        bucket_size : 1000000,
  
        # random seed
        seed : 39916801,
  
        # hashed feature name prefix
        feature_prefix : "hash_"
    },

    # preprocessing feature value
    # many learning algorithms(e.g. l1, l2 regularization) assume that features are centered around zero and have variance in the same order
    # if you use feature transform, a file recording statistical information will be written to "model.data_path" + "_feature_transform_stat" and
    # will be used in offline or online prediction.
    transform {
        # feature value
        switch_on : false,
        
        # preprocessing manner:
        #   standardization : x --> (x - mean) / stdvar
        #   scale_range : x --> min + (max - min) * (x - xmin) / (xmax - xmin)
        mode : "standardization",
      
        # if your preprocessing manner is "scale_range", you must supply min and max scale ranges
        scale_range {
            min : -1,
            max : 1
        },
	    
        # if include_features and exclude_features are both empty lists, the transformation is applied to all features. If you only want to transform a subset of features, put those features in include_features and set exclude_features to an empty list. If you want to transform all features except some, put them in exclude_features and set include_features to an empty list. If neither of them is an empty list, the include_features configuration takes effect and exclude_features is ignored.
        # preprocessing only a subset of features, e.g. ["f1", "f3"]
        include_features : [],

        # preprocessing all features except exclude_features, e.g. ["f2", "f3"]
        exclude_features : []
    }

	# if a feature's frequency of occurrence is less than filter_threshold, the feature will be filtered
    filter_threshold : 0
},

model {
    # path(checkpoint path) for model parameters, format:
    # f1,weight,precision(laplace approximate)
    # f2,weight,precision
    # ...
    # 
    # model dict data will be saved in path: "data_path" + "_dict". This dict can be used as other models "dict_path", e.g. you can use model dict of linear model with l1 as fm model's "dict_path"
    data_path : "???",
  
    # delimiter in model data
    delim : ",",
  
    # whether using user provided dict or other models' dict
    need_dict : false,
    
    # user provided dict path, if "need_dict:false", dict_path does not need to be provided
    # dict data format:
    # f1
    # f2
    # ....
    # 
    # attention: dict_path is input, model dict data is output
    dict_path : "",
  
    # model save frequency
    dump_freq : 50,
  
    # whether your model uses intercept(bias)
    need_bias : true,
  
    # bias feature name
    bias_feature_name : "_bias_",
  
    # whether continue to train from checkpoint
    continue_train : false
},

loss {
    # loss function of linear model.
    # e.g. "sigmoid" stands for Logistic Regression(active function is sigmoid and loss function is negative log likelihood). "l2" stands for Linear Regression. See "Objective Function(active function and loss function)" in models.md for more details
    loss_function : "sigmoid",
  
    # except loss, if you want to evaluate other metrics, like "auc", "confusion_matrix", "RMSE", ..., see evaluation_metrics.md for more details
    evaluate_metric : ["auc"],
  
    # whether you only want to evaluate some metrics(training is finished)
    just_evaluate : false,
  
    # model parameters regularization to avoid overfitting,
    # l1 and l2 can be used at the same time(elastic net)
    regularization : {
        # l1 regularization(parameters with the prior of laplace distribution)
        l1 : [5.28e-9],
        # l2 regularization(parameters with the prior of gaussian distribution)
        l2 : [5.28e-7]
    }
},

optimization {
    # optimizer type, this version only supports "line_search"("trust_region", "sgd" will be supported in the future)
    optimizer : "line_search",

    line_search {
        # step search stopping criterion:
        # "sufficient_decrease", "wolfe", "strong_wolfe" three modes
        # if you use "wolfe" mode, loss will always decrease no matter loss is convex or not
        mode : "wolfe",

        # backtracking is one of the most effective step search methods; just use the default values
        backtracking : {
            step_decr : 0.5,
            step_incr : 2.1,
            max_iter : 55,
            min_step : 1e-16,
            max_step : 1e18,
            c1 : 1e-4,
            c2 : 0.9
        }
        
        # L-BFGS is one of the best optimization methods for convex and non-convex functions
        lbfgs {
            # L-BFGS uses the most recent m steps of curvature information to approximate the Hessian matrix
            m : 8,
          	
          	# converge control 
            convergence : {
                # max iter step, if you want to converge more precisely(at the risk of overfitting), increase max_iter
                max_iter : 60,
          
                # if you want to converge precisely(have a risk of overfitting), decrease eps
                eps : 1e-3
            }
        }
    }
},

hyper {
    # switch of hyperparameter optimization, if you switch on, be sure to provide test data(validation)
    switch_on : false,
  
    # between two hyper opt steps, whether previous optimized model parameters will be used for hyper opt start in the next step.
    # if restart is true, hyper opt step always uses random model parameters, otherwise use previous optimized model parameters.
    # in general, convex loss function uses restart:false, non-convex loss function uses restart:true
    restart : false,
  
    # hyperparameter optimization method:
    #   "hoag" : hyperparameter optimization with approximate gradient(modified)
    #   "grid" : grid search
    mode : "hoag",

    # hyperparameter optimization with approximate gradient. 
    # using this method, optimal hyperparameters can always be found for convex loss functions; this cannot be guaranteed for non-convex loss functions.
    # attention: l1 hyperparameter not supported in hoag method!
    hoag {
        # hyperparameter optimization initial step length, 1.0 always works
        init_step : 1.0,
  
        # between two consecutive steps, if the gradient direction is opposite, the step will decrease: step = step * step_decr_factor
        step_decr_factor : 0.7,
        
        # if the difference in test loss between two consecutive steps is less than this limit, hyper opt will be aborted. if you want to get more optimal hyperparameters, decrease this value.
        test_loss_reduce_limit : 1e-5,
  
        # max hyper optimization step, if you want to get more optimal hyperparameters, increase this value.
        outer_iter : 10,
        
        # l1 regularization value(fixed)
        # l1 hyperparameter not supported in hoag method
        l1 : [0.0],
  
        # l2 regularization init value
        l2 : [5.28e-7]
    },

    # grid search, total optimization step=l1parts * l2parts
    grid {
        # range and interval of l1 hyperparameter to be searched
        # format: [l1left, l1right, l1parts], means [left, right] interval will be divided into "l1parts" equal parts, if you want to use fixed l1 value, set as follows: [fixedvalue, fixedvalue, 1]
        l1 : [1e-9, 1e-6, 5],
      
        # range and interval of l2 hyperparameter to be searched
        # format: [l2left, l2right, l2parts], means [left, right] interval will be divided into "l2parts" equal parts, if you want to use fixed l2 value, set as follows: [fixedvalue, fixedvalue, 1]
        l2 : [1e-8, 1e-5, 5]
    }

}