
"""
Machine learning
================

The command-line application for general-purpose machine learning.


Usage
-----

::

  python -m nordlys.core.ml.ml <config_file>


Config parameters
------------------

- **training_set**: training instances, in the nordlys ML instance file format (MIFF)
- **test_set**: test instances, in the nordlys ML instance file format (MIFF); if provided, it is always used for testing. It can be left empty when cross-validation is used, in which case the held-out fold is used for testing.
- **cross_validation**:
   - k: number of folds (default: 10); use -1 for leave-one-out
   - split_strategy: name of a property (normally query-id for IR problems). If set, instances with the same value for that property are kept in the same split; if not set, instances are randomly distributed among splits.
   - splits_file: JSON file with the splits (instance_ids); if the file exists it is used, otherwise it is generated (see the sketch after this list)
   - create_splits: if True, creates the CV splits. Otherwise the splits are loaded from the file given by "splits_file".
- **model**: ML model, currently supported values: rf, gbrt
- **category**: [regression | classification], default: "regression"
- **parameters**: dict with parameters of the given ML model
   - If GBRT:
      - alpha: learning rate, default: 0.1
      - tree: number of trees, default: 1000
      - depth: max depth of trees, default: 10% of number of features
   - If RF:
      - tree: number of trees, default: 1000
      - maxfeat: max features of trees, default: 10% of number of features
- **model_file**: the model is saved to this file
- **load_model**: if True, loads the model
- **feature_imp_file**: if provided, feature importances are saved to this file
- **output_file**: where the output is written; default output format: TSV with instance_id and (estimated) target
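
For cross-validation, the splits file stores the instance_ids of each fold. A
minimal sketch of a possible file, assuming folds keyed by fold index (the
exact structure is determined by the ``CrossValidation`` class; this is for
illustration only):

.. code:: json

    {
        "0": ["ins_1", "ins_4"],
        "1": ["ins_2", "ins_5"],
        "2": ["ins_3", "ins_6"]
    }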


Example config
---------------

.. code:: json

    {
        "model": "gbrt",
        "category": "regression",
        "parameters": {
            "alpha": 0.1,
            "tree": 10,
            "depth": 5
        },
        "training_set": "path/to/train.json",
        "test_set": "path/to/test.json",
        "model_file": "path/to/model.txt",
        "output_file": "path/to/output.json",
        "cross_validation": {
            "create_splits": true,
            "splits_file": "path/to/splits.json",
            "k": 5,
            "split_strategy": "q_id"
        }
    }
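
The training and test sets are given in the nordlys ML instance file format
(MIFF). As an illustration only (the field names below are assumed from how
``Instances`` is used in this module, not a definitive schema), a training
file might look like:

.. code:: json

    {
        "ins_1": {
            "target": "1",
            "features": {"feat_1": 0.5, "feat_2": 1.2},
            "properties": {"q_id": "q_1"}
        }
    }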

------------------------

:Authors: Faegheh Hasibi, Krisztian Balog
"""
import argparse
from sys import exit
import numpy
import pickle
from sklearn.ensemble import (GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor,
                              RandomForestClassifier)

from nordlys.core.ml.instances import Instances
from nordlys.core.ml.cross_validation import CrossValidation
from nordlys.config import PLOGGER
from nordlys.core.utils.file_utils import FileUtils


class ML(object):
    def __init__(self, config):
        self.__check_config(config)
        self.__config = config

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values."""
        try:
            # if "training_set" not in config:
            #     raise Exception("training_set is missing")
            # if "output_file" not in config:
            #     raise Exception("output_file is missing")
            if "cross_validation" in config:
                if "splits_file" not in config["cross_validation"]:
                    raise Exception("splits_file is missing")
                if "k" not in config["cross_validation"]:
                    config["cross_validation"]["k"] = 10
            # else:
            #     if "test_set" not in config:
            #         raise Exception("test_set is missing")
        except Exception as e:
            PLOGGER.error("Error in config file: " + str(e))
            exit(1)

    def gen_model(self, num_features=None):
        """Reads parameters and generates a model to be trained.

        :param num_features: int, number of features
        :return: untrained ranker/classifier
        """
        model = None
        if self.__config["model"].lower() == "gbrt":
            alpha = self.__config["parameters"].get("alpha", 0.1)
            tree = self.__config["parameters"].get("tree", 1000)
            default_depth = round(num_features / 10.0) if num_features is not None else None
            depth = self.__config["parameters"].get("depth", default_depth)
            PLOGGER.info("Training instances using GBRT ...")
            PLOGGER.info("Number of trees: " + str(tree) + "\tDepth of trees: " + str(depth))
            if self.__config.get("category", "regression") == "regression":
                PLOGGER.info("Training regressor")
                model = GradientBoostingRegressor(n_estimators=tree, max_depth=depth, learning_rate=alpha)
            else:
                PLOGGER.info("Training classifier")
                model = GradientBoostingClassifier(n_estimators=tree, max_depth=depth, learning_rate=alpha)
        elif self.__config["model"].lower() == "rf":
            tree = self.__config["parameters"].get("tree", 1000)
            default_maxfeat = round(num_features / 10.0) if num_features is not None else None
            max_feat = self.__config["parameters"].get("maxfeat", default_maxfeat)
            PLOGGER.info("Training instances using RF ...")
            PLOGGER.info("Number of trees: " + str(tree) + "\tMax features: " + str(max_feat))
            if self.__config.get("category", "regression") == "regression":
                PLOGGER.info("Training regressor")
                model = RandomForestRegressor(n_estimators=tree, max_features=max_feat)
            else:
                PLOGGER.info("Training classifier")
                model = RandomForestClassifier(n_estimators=tree, max_features=max_feat)
        return model

    def train_model(self, instances):
        """Trains the model on a given set of instances.

        :param instances: Instances object
        :return: the learned model
        """
        features = instances.get_all()[0].features
        features_names = sorted(features.keys())
        PLOGGER.info("Number of instances:\t" + str(len(instances.get_all())))
        PLOGGER.info("Number of features:\t" + str(len(features_names)))
        # Converts instances to the scikit-learn format: (n_samples, n_features)
        n_samples = len(instances.get_all())
        train_x = numpy.zeros((n_samples, len(features_names)))
        train_y = numpy.empty(n_samples, dtype=object)
        for i, ins in enumerate(instances.get_all()):
            train_x[i] = [ins.features[ftr] for ftr in features_names]
            if self.__config.get("category", "regression") == "regression":
                train_y[i] = float(ins.target)
            else:
                train_y[i] = str(ins.target)
        # Training
        model = self.gen_model(len(features))
        model.fit(train_x, train_y)
        # Writes the trained model to file
        if "model_file" in self.__config:
            # @todo if CV is used, the fold number needs to be appended to the file name
            PLOGGER.info("Writing trained model to {} ...".format(self.__config["model_file"]))
            pickle.dump(model, open(self.__config["model_file"], "wb"))
        if "feature_imp_file" in self.__config:
            print(self.analyse_features(model, features_names))
        return model
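
    # Note: a model pickled to "model_file" above can be restored later with the
    # standard pickle API; a minimal sketch (not part of this module's flow):
    #   with open(config["model_file"], "rb") as infile:
    #       model = pickle.load(infile)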

    def analyse_features(self, model, feature_names):
        """Ranks features based on their importance; scikit-learn uses the Gini
        score to compute feature importances.

        :param model: trained model
        :param feature_names: list of feature names
        """
        # Sorts the features to make sure they are in the same order as used during
        # training. This is especially important when the function is called outside
        # of train_model().
        feature_names = sorted(feature_names)
        # Gets feature importances
        importances = zip(feature_names, model.feature_importances_)
        sorted_importances = sorted(importances, key=lambda imps: imps[1], reverse=True)
        feat_imp_str = "=========== Feature Importance ===========\n"
        for feat, importance in sorted_importances:
            feat_imp_str += feat + "\t" + str(importance) + "\n"
        feat_imp_str += "=========================================="
        open(self.__config["feature_imp_file"], "w").write(feat_imp_str)
        return feat_imp_str

    def apply_model(self, instances, model):
        """Applies the model on a given set of instances.

        :param instances: Instances object
        :param model: trained model
        :return: Instances object with estimated targets
        """
        PLOGGER.info("Applying model ... ")
        if len(instances.get_all()) > 0:
            features_names = sorted(instances.get_all()[0].features.keys())
            for ins in instances.get_all():
                test_x = numpy.array([[ins.features[ftr] for ftr in features_names]])
                if self.__config.get("category", "regression") == "regression":
                    ins.score = model.predict(test_x)[0]
                else:  # classification
                    ins.target = str(model.predict(test_x)[0])
                    # predict_proba returns an array with one probability per class,
                    # e.g. [0.9, 0.1]; the probability of the second class is used as score
                    ins.score = model.predict_proba(test_x)[0][1]
        return instances

    def output(self, instances):
        """Writes results to the output file.

        :param instances: Instances object
        """
        with open(self.__config["output_file"], "w") as f:
            f.write("id\tscore\n")  # output to file
            PLOGGER.info("id\ttarget\tscore\n")
            for ins in instances.get_all():
                f.write(ins.id + "\t" + "{0:.5f}".format(ins.score) + "\n")  # output to file
        PLOGGER.info("Output saved in: " + self.__config["output_file"])

    def run(self):
        # Loads training instances
        ins_train = Instances.from_json(self.__config["training_set"])

        # Cross-validation
        if "cross_validation" in self.__config:
            cv = CrossValidation(self.__config["cross_validation"]["k"], ins_train, self.train_model,
                                 self.apply_model)
            split_strategy = self.__config["cross_validation"].get("split_strategy", None)
            split_file = self.__config["cross_validation"]["splits_file"]
            if bool(self.__config["cross_validation"].get("create_splits", False)) is True:
                # Always creates new splits when the create_splits flag is True
                cv.create_folds(split_strategy)
                cv.save_folds(split_file)
            else:
                # New splits are created only if the provided splits_file does not exist
                cv.get_folds(split_file, split_strategy)
            inss = cv.run()
        # Classic train-test split
        else:
            ins_test = Instances.from_json(self.__config["test_set"])
            model = self.train_model(ins_train)
            inss = self.apply_model(ins_test, model)

        # Outputs results (which are stored in inss)
        inss.to_json(self.__config["output_file"])
        # inss.to_treceval(self.__config["output_file"])
        # self.output(inss)
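
# A hypothetical programmatic usage sketch (paths and config values below are
# illustrative only, not part of this module):
#
#   config = {"model": "rf", "category": "classification",
#             "parameters": {"tree": 100},
#             "training_set": "path/to/train.json",
#             "output_file": "path/to/output.json"}
#   ml = ML(config)
#   model = ml.train_model(Instances.from_json(config["training_set"]))
#   inss = ml.apply_model(Instances.from_json("path/to/test.json"), model)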

def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="config file", type=str)
    args = parser.parse_args()
    return args

def main(args):
    config = FileUtils.load_config(args.config)
    ml = ML(config)
    ml.run()

if __name__ == "__main__":
    main(arg_parser())