"""
Machine learning
================

The command-line application for general-purpose machine learning.

Usage
-----

::

    python -m nordlys.core.ml.ml <config_file>
Config parameters
------------------

- **training_set**: nordlys ML instance file format (MIFF); a minimal example file is sketched below
- **test_set**: nordlys ML instance file format (MIFF); if provided, it is always used for testing. May be left empty when cross-validation is used, in which case the held-out split is used for testing.
- **cross_validation**:

  - k: number of folds (default: 10); use -1 for leave-one-out
  - split_strategy: name of a property (normally query-id for IR problems). If set, entities with the same value for that property are kept in the same split. If not set, entities are randomly distributed among splits.
  - splits_file: JSON file with splits (instance_ids); if the file is provided it is used, otherwise it is generated
  - create_splits: if True, creates the CV splits; otherwise loads the splits from the "splits_file" parameter

- **model**: ML model; currently supported values: rf, gbrt
- **category**: [regression | classification], default: "regression"
- **parameters**: dict with the parameters of the given ML model

  - If GBRT:

    - alpha: learning rate, default: 0.1
    - tree: number of trees, default: 1000
    - depth: max depth of trees, default: 10% of the number of features

  - If RF:

    - tree: number of trees, default: 1000
    - maxfeat: max features of trees, default: 10% of the number of features

- **model_file**: the trained model is saved to this file
- **load_model**: if True, loads the model
- **feature_imp_file**: feature importances are saved to this file
- **output_file**: where the output is written; default output format: TSV with instance_id and (estimated) target
Example config
---------------

.. code:: python

    {
        "model": "gbrt",
        "category": "regression",
        "parameters": {
            "alpha": 0.1,
            "tree": 10,
            "depth": 5
        },
        "training_set": "path/to/train.json",
        "test_set": "path/to/test.json",
        "model_file": "path/to/model.txt",
        "output_file": "path/to/output.json",
        "cross_validation": {
            "create_splits": true,
            "splits_file": "path/to/splits.json",
            "k": 5,
            "split_strategy": "q_id"
        }
    }
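
Example instance file
---------------------

A minimal sketch of what a MIFF ``training_set``/``test_set`` file might look
like, assuming instances are keyed by their IDs and carry ``features``,
``target``, and ``properties`` fields; the authoritative schema is whatever
``Instances.from_json`` accepts, so treat the field names below as
illustrative rather than a specification.

.. code:: python

    {
        "001": {
            "features": {"f1": 0.5, "f2": 1.2},
            "target": "1",
            "properties": {"q_id": "q1"}
        },
        "002": {
            "features": {"f1": 0.1, "f2": 0.7},
            "target": "0",
            "properties": {"q_id": "q1"}
        }
    }
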
------------------------
:Authors: Faegheh Hasibi, Krisztian Balog
"""
import argparse
from sys import exit
import numpy
import pickle
from sklearn.ensemble import (GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor,
                              RandomForestClassifier)
from nordlys.core.ml.instances import Instances
from nordlys.core.ml.cross_validation import CrossValidation
from nordlys.config import PLOGGER
from nordlys.core.utils.file_utils import FileUtils


class ML(object):

    def __init__(self, config):
        self.__check_config(config)
        self.__config = config

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values."""
        try:
            # if "training_set" not in config:
            #     raise Exception("training_set is missing")
            # if "output_file" not in config:
            #     raise Exception("output_file is missing")
            if "cross_validation" in config:
                if "splits_file" not in config["cross_validation"]:
                    raise Exception("splits_file is missing")
                if "k" not in config["cross_validation"]:
                    config["cross_validation"]["k"] = 10
            # else:
            #     if "test_set" not in config:
            #         raise Exception("test_set is missing")
        except Exception as e:
            PLOGGER.error("Error in config file: " + str(e))
            exit(1)

    def gen_model(self, num_features=None):
        """Reads parameters and generates a model to be trained.

        :param num_features: int, number of features
        :return: untrained ranker/classifier
        """
        model = None
        if self.__config["model"].lower() == "gbrt":
            alpha = self.__config["parameters"].get("alpha", 0.1)
            tree = self.__config["parameters"].get("tree", 1000)
            default_depth = round(num_features / 10.0) if num_features is not None else None
            depth = self.__config["parameters"].get("depth", default_depth)
            PLOGGER.info("Training instances using GBRT ...")
            PLOGGER.info("Number of trees: " + str(tree) + "\tDepth of trees: " + str(depth))
            if self.__config.get("category", "regression") == "regression":
                PLOGGER.info("Training regressor")
                model = GradientBoostingRegressor(n_estimators=tree, max_depth=depth, learning_rate=alpha)
            else:
                PLOGGER.info("Training the classifier")
                model = GradientBoostingClassifier(n_estimators=tree, max_depth=depth, learning_rate=alpha)
        elif self.__config["model"].lower() == "rf":
            tree = self.__config["parameters"].get("tree", 1000)
            default_maxfeat = round(num_features / 10.0) if num_features is not None else None
            max_feat = self.__config["parameters"].get("maxfeat", default_maxfeat)
            PLOGGER.info("Training instances using RF ...")
            PLOGGER.info("Number of trees: " + str(tree) + "\tMax features: " + str(max_feat))
            if self.__config.get("category", "regression") == "regression":
                PLOGGER.info("Training regressor")
                model = RandomForestRegressor(n_estimators=tree, max_features=max_feat)
            else:
                PLOGGER.info("Training classifier")
                model = RandomForestClassifier(n_estimators=tree, max_features=max_feat)
        return model

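    # Illustrative note (not part of the original module): with the example config
    # from the module docstring, gen_model() returns an untrained
    # GradientBoostingRegressor(n_estimators=10, max_depth=5, learning_rate=0.1);
    # nothing is fitted until train_model() calls model.fit().
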
    def train_model(self, instances):
        """Trains the model on a given set of instances.

        :param instances: Instances object
        :return: the learned model
        """
        features = instances.get_all()[0].features
        features_names = sorted(features.keys())
        PLOGGER.info("Number of instances:\t" + str(len(instances.get_all())))
        PLOGGER.info("Number of features:\t" + str(len(features_names)))
        # converts instances to the scikit-learn format: (n_samples, n_features)
        n_samples = len(instances.get_all())
        train_x = numpy.zeros((n_samples, len(features_names)))
        train_y = numpy.empty(n_samples, dtype=object)
        for i, ins in enumerate(instances.get_all()):
            train_x[i] = [ins.features[ftr] for ftr in features_names]
            if self.__config.get("category", "regression") == "regression":
                train_y[i] = float(ins.target)
            else:
                train_y[i] = str(ins.target)
        # training
        model = self.gen_model(len(features_names))
        model.fit(train_x, train_y)
        # writes the trained model to the file
        if "model_file" in self.__config:
            # @todo if CV is used we need to append the fold no. to the filename
            PLOGGER.info("Writing trained model to {} ...".format(self.__config["model_file"]))
            with open(self.__config["model_file"], "wb") as f:
                pickle.dump(model, f)
        if "feature_imp_file" in self.__config:
            print(self.analyse_features(model, features_names))
        return model

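    # Illustrative example of the conversion above: two instances with features
    # {"f1": 1.0, "f2": 2.0} and {"f1": 3.0, "f2": 4.0} become (with feature names
    # sorted alphabetically) train_x = [[1.0, 2.0], [3.0, 4.0]], i.e. scikit-learn's
    # (n_samples, n_features) layout; train_y holds the matching targets.
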
    def analyse_features(self, model, feature_names):
        """Ranks features based on their importance.

        Scikit-learn uses the Gini score to compute feature importances.

        :param model: trained model
        :param feature_names: list of feature names
        """
        # The features are sorted to make sure they are in the same order as used
        # during training. This is especially important when the function is called
        # outside train_model().
        feature_names = sorted(feature_names)
        # gets feature importances
        importances = zip(feature_names, model.feature_importances_)
        sorted_importances = sorted(importances, key=lambda imps: imps[1], reverse=True)
        feat_imp_str = "=========== Feature Importance ===========\n"
        for feat, importance in sorted_importances:
            feat_imp_str += feat + "\t" + str(importance) + "\n"
        feat_imp_str += "=========================================="
        with open(self.__config["feature_imp_file"], "w") as f:
            f.write(feat_imp_str)
        return feat_imp_str

    def apply_model(self, instances, model):
        """Applies the model to a given set of instances.

        :param instances: Instances object
        :param model: trained model
        :return: Instances
        """
        PLOGGER.info("Applying model ... ")
        if len(instances.get_all()) > 0:
            features_names = sorted(instances.get_all()[0].features.keys())
            for ins in instances.get_all():
                test_x = numpy.array([[ins.features[ftr] for ftr in features_names]])
                if self.__config.get("category", "regression") == "regression":
                    ins.score = model.predict(test_x)[0]
                else:  # classification
                    ins.target = str(model.predict(test_x)[0])
                    # predict_proba returns one probability per class, e.g. [0.9, 0.1];
                    # indexing [0][1] assumes binary classification and takes the
                    # probability of the second (positive) class
                    ins.score = model.predict_proba(test_x)[0][1]
        return instances

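    # Sketch (an assumption, not part of the original module): apply_model() calls
    # predict() once per instance; an equivalent batch variant would predict all
    # instances in a single call, e.g.:
    #     test_x = numpy.array([[ins.features[f] for f in features_names]
    #                           for ins in instances.get_all()])
    #     scores = model.predict(test_x)
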
    def output(self, instances):
        """Writes results to the output file.

        :param instances: Instances object
        """
        with open(self.__config["output_file"], "w") as f:
            f.write("id\tscore\n")  # header of the TSV output
            for ins in instances.get_all():
                f.write(ins.id + "\t" + "{0:.5f}".format(ins.score) + "\n")
        PLOGGER.info("Output saved in: " + self.__config["output_file"])

    def run(self):
        # loads training instances
        ins_train = Instances.from_json(self.__config["training_set"])
        # cross-validation
        if "cross_validation" in self.__config:
            cv = CrossValidation(self.__config["cross_validation"]["k"], ins_train, self.train_model,
                                 self.apply_model)
            split_strategy = self.__config["cross_validation"].get("split_strategy", None)
            split_file = self.__config["cross_validation"]["splits_file"]
            if bool(self.__config["cross_validation"].get("create_splits", False)) is True:
                # always creates new splits when the create_splits flag is True
                cv.create_folds(split_strategy)
                cv.save_folds(split_file)
            else:
                # loads the splits from splits_file; new splits are created only
                # if the provided file does not exist
                cv.get_folds(split_file, split_strategy)
            inss = cv.run()
        # classic train-test split
        else:
            ins_test = Instances.from_json(self.__config["test_set"])
            model = self.train_model(ins_train)
            inss = self.apply_model(ins_test, model)
        # outputs the results (which are stored in inss)
        inss.to_json(self.__config["output_file"])
        # inss.to_treceval(self.__config["output_file"])
        # self.output(inss)


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="config file", type=str)
    args = parser.parse_args()
    return args


def main(args):
    config = FileUtils.load_config(args.config)
    ml = ML(config)
    ml.run()


if __name__ == "__main__":
    main(arg_parser())