"""Machine Learning

The command-line application for general-purpose machine learning.



  python -m nordlys.core.ml.ml <config_file>

Config parameters

- **training_set**: nordlys ML instance file format (MIFF)
- **test_set**: nordlys ML instance file format (MIFF); if provided then it's always used for testing. Can be left empty if cross-validation is used, in which case the remaining split is used for testing.
- **cross_validation**:
   - k: number of folds (default: 10); use -1 for leave-one-out
   - split_strategy: name of a property (normally query-id for IR problems). If set, the entities with the same value for that property are kept in the same split. If not set, entities are randomly distributed among splits.
   - splits_file: JSON file with splits (instance_ids); if the file is provided it is used, otherwise it's generated
   - create_splits: if True, creates the CV splits. Otherwise loads the splits from "split_file" parameter.
- **model**: ML model, currently supported values: rf, gbrt
- **category**: [regression | classification], default: "regression"
- **parameters**: dict with parameters of the given ML model
   - If GBRT:
      - alpha: learning rate, default: 0.1
      - tree: number of trees, default: 1000
      - depth: max depth of trees, default: 10% of number of features
   - If RF:
      - tree: number of trees, default: 1000
      - maxfeat: max features of trees, default: 10% of number of features
- **model_file**: the model is saved to this file
- **load_model**: if True, loads the model
- **feature_imp_file**: Feature importance is saved to this file
- **output_file**: where output is written; default output format: TSV with instance_id and (estimated) target

Example config
--------------

.. code:: python

    {
        "model": "gbrt",
        "category": "regression",
        "parameters": {
            "alpha": 0.1,
            "tree": 10,
            "depth": 5
        },
        "training_set": "path/to/train.json",
        "test_set": "path/to/test.json",
        "model_file": "path/to/model.txt",
        "output_file": "path/to/output.json",
        "cross_validation": {
            "create_splits": true,
            "splits_file": "path/to/splits.json",
            "k": 5,
            "split_strategy": "q_id"
        }
    }


:Authors: Faegheh Hasibi, Krisztian Balog
"""
import argparse
import pickle
from sys import exit

import numpy
from sklearn.ensemble import (GradientBoostingClassifier, GradientBoostingRegressor,
                              RandomForestClassifier, RandomForestRegressor)

from nordlys.config import PLOGGER
from nordlys.core.ml.cross_validation import CrossValidation
from import Instances
from nordlys.core.utils.file_utils import FileUtils

[docs]class ML(object): def __init__(self, config): self.__check_config(config) self.__config = config @staticmethod def __check_config(config): """Checks config parameters and set default values.""" try: # if "training_set" not in config: # raise Exception("training_set is missing") # if "output_file" not in config: # raise Exception("output_file is missing") if "cross_validation" in config: if "splits_file" not in config["cross_validation"]: raise Exception("splits_file is missing") if "k" not in config["cross_validation"]: config["cross_validation"]["k"] = 10 # else: # if "test_set" not in config: # raise Exception("test_set is missing") except Exception as e: PLOGGER.error("Error in config file: ", e) exit(1)
[docs] def gen_model(self, num_features=None): """ Reads parameters and generates a model to be trained. :param num_features: int, number of features :return untrained ranker/classifier """ model = None if self.__config["model"].lower() == "gbrt": alpha = self.__config["parameters"].get("alpha", 0.1) tree = self.__config["parameters"].get("tree", 1000) default_depth = round(num_features / 10.0) if num_features is not None else None depth = self.__config["parameters"].get("depth", default_depth)"Training instances using GBRT ...")"Number of trees: " + str(tree) + "\tDepth of trees: " + str(depth)) if self.__config.get("category", "regression") == "regression":"Training regressor") model = GradientBoostingRegressor(n_estimators=tree, max_depth=depth, learning_rate=alpha) else:"Training the classifier") model = GradientBoostingClassifier(n_estimators=tree, max_depth=depth, learning_rate=alpha) elif self.__config["model"].lower() == "rf": tree = self.__config["parameters"].get("tree", 1000) default_maxfeat = round(num_features / 10.0) if num_features is not None else None max_feat = self.__config["parameters"].get("maxfeat", default_maxfeat)"Training instances using RF ...")"Number of trees: " + str(tree) + "\tMax features: " + str(max_feat)) if self.__config.get("category", "regression") == "regression":"Training regressor") model = RandomForestRegressor(n_estimators=tree, max_features=max_feat) else:"Training classifier") model = RandomForestClassifier(n_estimators=tree, max_features=max_feat) return model
[docs] def train_model(self, instances): """Trains model on a given set of instances. :param instances: Instances object :return: the learned model """ features = instances.get_all()[0].features features_names = sorted(features.keys())"Number of instances:\t" + str(len(instances.get_all())))"Number of features:\t" + str(len(features_names))) # Converts instances to Scikit-learn format : (n_samples, n_features) n_samples = len(instances.get_all()) train_x = numpy.zeros((n_samples, len(features_names))) train_y = numpy.empty(n_samples, dtype=object) # numpy.zeros(n_samples) for i, ins in enumerate(instances.get_all()): train_x[i] = [ins.features[ftr] for ftr in features_names] if self.__config.get("category", "regression") == "regression": train_y[i] = float( else: train_y[i] = str( # training model = self.gen_model(len(features)), train_y) # write the trained model to the file if "model_file" in self.__config: # @todo if CV is used we need to append the fold no. to the filename"Writing trained model to {} ...".format(self.__config["model_file"])) pickle.dump(model, open(self.__config["model_file"], "wb")) if "feature_imp_file" in self.__config: print(self.analyse_features(model, features_names)) return model
[docs] def analyse_features(self, model, feature_names): """ Ranks features based on their importance. Scikit uses Gini score to get feature importances. :param model: trained model :param feature_names: list of feature names """ # we sort the features to make sure that are in the same order as they used while training. # This is especially important when the function is called outside "train_model" function. feature_names = sorted(feature_names) # gets feature importance importances = zip(feature_names, model.feature_importances_) sorted_importances = sorted(importances, key=lambda imps: imps[1], reverse=True) feat_imp_str = "=========== Feature Importance ===========\n" for feat, importance in sorted_importances: feat_imp_str += feat + "\t" + str(importance) + "\n" feat_imp_str += "==========================================" open(self.__config["feature_imp_file"], "w").write(feat_imp_str) return feat_imp_str
[docs] def apply_model(self, instances, model): """Applies model on a given set of instances. :param instances: Instances object :param model: trained model :return: Instances """"Applying model ... ") if len(instances.get_all()) > 0: features_names = sorted(instances.get_all()[0].features.keys()) for ins in instances.get_all(): test_x = numpy.array([[ins.features[ftr] for ftr in features_names]]) if self.__config.get("category", "regression") == "regression": ins.score = model.predict(test_x)[0] else: # classification = str(model.predict(test_x)[0]) # "predict_proba" gets class probabilities; an array of probabilities for each class e.g.[0.99, 0.1] ins.score = model.predict_proba(test_x)[0][1] return instances
[docs] def output(self, instances): """Writes results to output file. :param instances: Instances object """ with open(self.__config["output_file"], "w") as f: f.write("id\tscore\n") # output to file"id\ttarget\tscore\n") for ins in instances.get_all(): f.write( + "\t" + "{0:.5f}".format(ins.score) + "\n") # output to file"Output saved in: " + self.__config["output_file"])
[docs] def run(self): # load training instances ins_train = Instances.from_json(self.__config["training_set"]) # Cross Validation if "cross_validation" in self.__config: cv = CrossValidation(self.__config["cross_validation"]["k"], ins_train, self.train_model, self.apply_model) split_strategy = self.__config["cross_validation"].get("split_strategy", None) split_file = self.__config["cross_validation"]["splits_file"] # Always creates new splits if the create_flag is True if bool(self.__config["cross_validation"].get("create_splits", False)) is True: cv.create_folds(split_strategy) cv.save_folds(split_file) # New splits will be created only if the provided split_file does not exist. else: cv.get_folds(split_file, split_strategy) inss = # classic test-train split else: ins_test = Instances.from_json(self.__config["test_set"]) model = self.train_model(ins_train) inss = self.apply_model(ins_test, model) # output results (which are stored in inss) inss.to_json(self.__config["output_file"])
# inss.to_treceval(self.__config["output_file"]) # self.output(inss)
def arg_parser():
    """Parses the command-line arguments.

    :return: parsed arguments, with the config file path on ``config``
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("config", help="config file", type=str)
    return cli.parse_args()
def main(args):
    """Loads the config file and runs the ML application.

    :param args: parsed command-line arguments with a ``config`` attribute
    """
    config = FileUtils.load_config(args.config)
    ml = ML(config)
    # Fix: the ML instance was created but the pipeline was never executed.
# Script entry point.
if __name__ == "__main__":
    main(arg_parser())