Source code for RankingTrainValidationSplit

# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.


import sys

if sys.version >= '3':
    basestring = str

from pyspark import SparkContext
from pyspark.ml import Estimator, Model
from pyspark.ml.common import inherit_doc, _py2java
from pyspark.ml.param import Params, Param, TypeConverters
from pyspark.ml.param.shared import *
from pyspark.ml.tuning import ValidatorParams
from pyspark.ml.util import *
from pyspark.ml.wrapper import JavaParams
from mmlspark.Utils import *
from mmlspark._RankingTrainValidationSplit import _RankingTrainValidationSplit, _RankingTrainValidationSplitModel


class HasCollectSubMetrics(Params):
    """
    Mixin for param collectSubMetrics: Param for whether to collect the validation
    metrics of the sub-models trained during tuning. If set to false, only the
    metric of the single best sub-model will be available after fitting. If set to
    true, the metrics of all sub-models will be available. Warning: for large
    parameter grids, collecting all sub-metrics uses extra memory on the Spark driver.
    """

    collectSubMetrics = Param(Params._dummy(), "collectSubMetrics",
                              "Param for whether to collect a list of sub-model metrics.",
                              typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasCollectSubMetrics, self).__init__()
        self._setDefault(collectSubMetrics=False)

    def setCollectSubMetrics(self, value):
        """
        Sets the value of :py:attr:`collectSubMetrics`.
        """
        return self._set(collectSubMetrics=value)

    def getCollectSubMetrics(self):
        """
        Gets the value of collectSubMetrics or its default value.
        """
        return self.getOrDefault(self.collectSubMetrics)
class HasCollectSubModels(Params):
    """
    Mixin for param collectSubModels: Param for whether to collect a list of
    sub-models trained during tuning. If set to false, then only the single best
    sub-model will be available after fitting. If set to true, then all sub-models
    will be available. Warning: For large models, collecting all sub-models can
    cause OOMs on the Spark driver.
    """

    collectSubModels = Param(Params._dummy(), "collectSubModels",
                             "Param for whether to collect a list of sub-models trained during tuning. "
                             "If set to false, then only the single best sub-model will be available "
                             "after fitting. If set to true, then all sub-models will be available. "
                             "Warning: For large models, collecting all sub-models can cause OOMs on "
                             "the Spark driver.",
                             typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasCollectSubModels, self).__init__()
        self._setDefault(collectSubModels=False)

    def setCollectSubModels(self, value):
        """
        Sets the value of :py:attr:`collectSubModels`.
        """
        return self._set(collectSubModels=value)

    def getCollectSubModels(self):
        """
        Gets the value of collectSubModels or its default value.
        """
        return self.getOrDefault(self.collectSubModels)
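

# A minimal sketch, not part of the original module, showing how the two param
# mixins above behave: each param defaults to False via _setDefault and is read
# back with getOrDefault. The _DemoParams class and function name are
# hypothetical, for illustration only.
def _demo_collect_params():
    class _DemoParams(HasCollectSubModels, HasCollectSubMetrics):
        pass

    demo = _DemoParams()
    assert demo.getCollectSubModels() is False   # default set in __init__
    assert demo.getCollectSubMetrics() is False
    demo.setCollectSubModels(True)               # pyspark-style setter, returns self
    assert demo.getCollectSubModels() is True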
@inherit_doc
class RankingTrainValidationSplit(_RankingTrainValidationSplit, Estimator, ValidatorParams,
                                  HasCollectSubModels, HasCollectSubMetrics, HasParallelism):

    def _create_model(self, java_model):
        model = RankingTrainValidationSplitModel()
        model._java_obj = java_model
        model._transfer_params_from_java()
        return model

    def _fit(self, dataset):
        # Fit on the JVM side and wrap the resulting Java model in its Python
        # counterpart so that fit() returns a RankingTrainValidationSplitModel.
        return self._create_model(self._to_java().fit(dataset._jdf))

    @classmethod
    def _from_java(cls, java_stage):
        """
        Given a Java RankingTrainValidationSplit, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        estimator, epms, evaluator = super(RankingTrainValidationSplit, cls)._from_java_impl(java_stage)
        trainRatio = java_stage.getTrainRatio()
        seed = java_stage.getSeed()
        parallelism = java_stage.getParallelism()
        collectSubModels = java_stage.getCollectSubModels()
        collectSubMetrics = java_stage.getCollectSubMetrics()
        # Create a new instance of this stage.
        py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator,
                       trainRatio=trainRatio, seed=seed, parallelism=parallelism,
                       collectSubModels=collectSubModels, collectSubMetrics=collectSubMetrics)
        py_stage._resetUid(java_stage.uid())
        return py_stage

    def _to_java(self):
        """
        Transfer this instance to a Java RankingTrainValidationSplit. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """
        estimator, epms, evaluator = super(RankingTrainValidationSplit, self)._to_java_impl()
        _java_obj = JavaParams._new_java_obj("com.microsoft.ml.spark.RankingTrainValidationSplit", self.uid)
        _java_obj.setEstimatorParamMaps(epms)
        _java_obj.setEvaluator(evaluator)
        _java_obj.setEstimator(estimator)
        _java_obj.setTrainRatio(self.getTrainRatio())
        _java_obj.setSeed(self.getSeed())
        _java_obj.setParallelism(self.getParallelism())
        _java_obj.setCollectSubModels(self.getCollectSubModels())
        _java_obj.setCollectSubMetrics(self.getCollectSubMetrics())
        return _java_obj
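

# A hedged usage sketch, not part of the original module: it wires the estimator
# above the same way pyspark's TrainValidationSplit is wired, using ALS as the
# ranking estimator. The column names ("userId", "itemId", "rating") and the
# RegressionEvaluator stand-in for a ranking evaluator are illustrative
# assumptions, as is the function name.
def _demo_ranking_tvs(ratings_df):
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.ml.tuning import ParamGridBuilder

    als = ALS(userCol="userId", itemCol="itemId", ratingCol="rating")
    grid = ParamGridBuilder().addGrid(als.regParam, [0.1, 0.01]).build()
    tvs = RankingTrainValidationSplit(estimator=als,
                                      estimatorParamMaps=grid,
                                      evaluator=RegressionEvaluator(labelCol="rating"),
                                      trainRatio=0.8,
                                      parallelism=2)
    model = tvs.fit(ratings_df)           # RankingTrainValidationSplitModel
    return model.recommendForAllUsers(3)  # top-3 items per user via the best model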
@inherit_doc
class RankingTrainValidationSplitModel(_RankingTrainValidationSplitModel, Model, ValidatorParams):

    def _transform(self, dataset):
        return self.bestModel.transform(dataset)
    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid and some extra params.
        This copies the underlying bestModel, creates a deep copy of the embedded paramMap,
        and copies the embedded and extra parameters over. It also creates a shallow copy
        of the validationMetrics.

        :param extra: Extra parameters to copy to the new instance
        :return: Copy of this instance
        """
        if extra is None:
            extra = dict()
        bestModel = self.bestModel.copy(extra)
        validationMetrics = list(self.validationMetrics)
        subModels = self.subModels
        subMetrics = self.subMetrics
        return RankingTrainValidationSplitModel(bestModel, validationMetrics, subModels, subMetrics)
    def recommendForAllUsers(self, numItems):
        # Delegate to the best model through the JVM, since the generic
        # bestModel wrapper may not expose recommendForAllUsers directly.
        return self.bestModel._call_java("recommendForAllUsers", numItems)

    def recommendForAllItems(self, numItems):
        # Same delegation pattern as recommendForAllUsers.
        return self.bestModel._call_java("recommendForAllItems", numItems)
    @classmethod
    def from_java(cls, java_stage):
        """
        Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        # Load information from java_stage to the instance.
        bestModel = JavaParams._from_java(java_stage.bestModel())
        estimator, epms, evaluator = super(RankingTrainValidationSplitModel, cls)._from_java_impl(java_stage)
        # Create a new instance of this stage.
        py_stage = cls(bestModel=bestModel).setEstimator(estimator)
        py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)
        if java_stage.hasSubModels():
            py_stage.subModels = [JavaParams._from_java(sub_model)
                                  for sub_model in java_stage.subModels()]
        if java_stage.hasSubMetrics():
            py_stage.subMetrics = [JavaParams._from_java(sub_metrics)
                                   for sub_metrics in java_stage.subMetrics()]
        py_stage._resetUid(java_stage.uid())
        return py_stage
    def _to_java(self):
        """
        Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """
        sc = SparkContext._active_spark_context
        # TODO: persist validation metrics as well
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.tuning.TrainValidationSplitModel",
            self.uid,
            self.bestModel._to_java(),
            _py2java(sc, []))
        estimator, epms, evaluator = super(RankingTrainValidationSplitModel, self)._to_java_impl()
        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)
        if self.subModels is not None:
            java_sub_models = [sub_model._to_java() for sub_model in self.subModels]
            _java_obj.setSubModels(java_sub_models)
        if self.subMetrics is not None:
            java_sub_metrics = [sub_metrics._to_java() for sub_metrics in self.subMetrics]
            _java_obj.setSubMetrics(java_sub_metrics)
        return _java_obj
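

# A hedged sketch, not part of the original module, exercising the model
# helpers defined above (copy, validationMetrics, and the recommend*
# delegates) on an already-fitted model. The function name is hypothetical,
# and the validationMetrics attribute is assumed to be populated, as in
# pyspark's TrainValidationSplitModel.
def _demo_model_helpers(model):
    clone = model.copy()                        # new uid; shallow-copies validationMetrics
    best_metric = max(model.validationMetrics)  # assumes a larger-is-better metric
    user_recs = model.recommendForAllUsers(10)  # top-10 items per user from bestModel
    item_recs = model.recommendForAllItems(10)  # top-10 users per item from bestModel
    return clone, best_metric, user_recs, item_recs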