From 756ebf57f1d62ed6bbe73ef0da8823abff86a836 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Sat, 13 Jul 2019 12:31:42 +0200 Subject: [PATCH 01/15] Update docs for CsvLoader (loadCsvDataset) --- .../picnicml/doddlemodel/data/CsvLoader.scala | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala b/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala index 4931c036..24f1d840 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala @@ -9,7 +9,29 @@ import scala.io.{BufferedSource, Source} object CsvLoader { - /** Loads a csv dataset with 2 header lines (1st line for feature names and 2nd for types). */ + /** Loads a csv dataset with 2 header lines (1st line for feature names and 2nd for types). + * @param datasetFilePath csv file to load + * @param na value to interpret as N/A data in the given dataset + * + * @example Reading the iris dataset. + * {{{ + * import java.io.File + * + * val file = new File("/datasets/iris.csv") + * val (data, featureInfo) = loadCsvDataset(file) + * // separate features from the label + * val (irisFeatures, irisLabels) = (data(::, 0 to 3), data(::, -1)) + * }}} + * + * @example Reading a dataset where N/A values are marked with `NA`. 
+ * {{{ + * import java.io.File + * + * val file = new File("/datasets/dummy_csv_reading.csv") + * // specify a value to interpret as N/A data + * val (data, featureInfo) = loadCsvDataset(file, "NA") + * }}} + * */ def loadCsvDataset(datasetFilePath: String, na: String = "NA"): FeaturesWithIndex = loadCsvDataset(Source.fromFile(datasetFilePath), na) From 14c588d3dafb50220515391eaa643a8af2bdabe0 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Sat, 13 Jul 2019 12:37:33 +0200 Subject: [PATCH 02/15] Update docs for DatasetUtils (shuffleDataset, splitDataset) --- .../doddlemodel/data/DatasetUtils.scala | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala b/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala index 78e94c94..44e86b0c 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala @@ -6,13 +6,47 @@ import scala.util.Random object DatasetUtils { - /** Shuffles rows of the dataset. */ + /** Shuffles rows of the dataset. + * @param x features to be shuffled + * @param y labels corresponding to features + * + * @example Shuffle a dataset randomly. + * {{{ + * import scala.util.Random + * + * // we are assuming data was previously loaded + * val (dataX, dataY) = ... + * + * val (shuffledX, shuffledY) = shuffleDataset(dataX, dataY) + * + * // seeded shuffle - seed passed to shuffler implicitly + * implicit val rand: Random = new Random(42) + * val (shuffledX, shuffledY) = shuffleDataset(dataX, dataY) + * }}} + * */ def shuffleDataset(x: Features, y: Target)(implicit rand: Random = new Random()): Dataset = { val shuffleIndices = rand.shuffle((0 until y.length).toIndexedSeq) (x(shuffleIndices, ::).toDenseMatrix, y(shuffleIndices).toDenseVector) } - /** Splits the dataset into two subsets for training and testing. 
*/ + /** Splits the dataset into two subsets for training and testing. + * @param x features to be split + * @param y labels corresponding to features + * @param proportionTrain proportion of dataset to be put into training set - between 0.0 and 1.0 + * + * @example Split dataset into training and test set. + * {{{ + * // we are assuming data was previously loaded + * val (dataX, dataY) = ... + * + * // by default, the split is 50%:50% + * val trTeSplit = splitDataset(dataX, dataY) + * + * // put 80% of data into training set and 20% into test set + * val trTeSplit = splitDataset(dataX, dataY, 0.8) + * val (trainX, trainY, testX, testY) = (trTeSplit.xTr, trTeSplit.yTr, trTeSplit.xTe, trTeSplit.yTe) + * }}} + * */ def splitDataset(x: Features, y: Target, proportionTrain: Float = 0.5f): TrainTestSplit = { val numTrain = numberOfTrainExamplesBasedOnProportion(x.rows, proportionTrain) val trIndices = 0 until numTrain From 7b3a30bb555e4ae32813d04de884ad14498d8c15 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Sat, 13 Jul 2019 12:39:13 +0200 Subject: [PATCH 03/15] Update docs for Feature (FeatureIndex's functions) --- .../picnicml/doddlemodel/data/Feature.scala | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala b/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala index b5a5c929..09b52bea 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala @@ -35,11 +35,33 @@ object Feature { subset(subsetIndices:_*) } + /** Create a feature index with subset of features, provided by feature names. + * @param names subset of features to be selected + * + * @example Create feature index based on features "f1" and "f3" from a constructed feature index. 
+ * {{{ + * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature, + * NumericalFeature), List(0, 1, 2)) + * val subIndex = featureIndex.subset("f1", "f3") + * }}} + * + * */ def subset(names: String*): FeatureIndex = { val nameToIndex = this.names.zipWithIndex.toMap subset(names.map(n => nameToIndex(n)):_*) } + /** Create a feature index with subset of features, provided by feature indices. + * @param indices column indices for subset of features to be selected + * + * @example Create feature index based on second and third (i.e. indices 1, 2) features from a constructed + * feature index. + * {{{ + * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature, + * NumericalFeature), List(0, 1, 2)) + * val subIndex = featureIndex.subset(1 to 2) + * }}} + * */ def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*) // DummyImplicit is needed to avoid the same type as String* after erasure @@ -49,6 +71,7 @@ object Feature { indices.toIndexedSeq.map(i => this.columnIndices(i)) ) + /** Create a feature index by dropping a feature by column index. */ def drop(index: Int): FeatureIndex = new FeatureIndex( this.names.zipWithIndex.flatMap { case (n, i) => if (i != index) n.some else none[String] }, this.types.zipWithIndex.flatMap { case (t, i) => if (i != index) t.some else none[FeatureType] }, @@ -61,26 +84,54 @@ object Feature { this.names.zip(this.types).map { case (n, t) => s"$n (${t.headerLineString})" } mkString ", " } + /** A structure that keeps track of feature metadata (names, types and indices). This is needed + * because some methods are only applicable to a certain type of features, e.g. [0, 1] scaling + * only makes sense for numerical features. */ object FeatureIndex { - + /** Construct feature index with `n` categorical features. Feature names are generated automatically. 
+ * @param n number of categorical features in feature index + */ def categorical(n: Int): FeatureIndex = categorical((0 until n).toList) def categorical(columnIndices: List[Int]): FeatureIndex = apply(columnIndices.indices.map(i => s"f$i").toList, columnIndices.map(_ => CategoricalFeature), columnIndices) + /** Construct feature index with `n` numerical features. Feature names are generated automatically. + * @param n number of numerical features in feature index + */ def numerical(n: Int): FeatureIndex = numerical((0 until n).toList) def numerical(columnIndices: List[Int]): FeatureIndex = apply(columnIndices.indices.map(i => s"f$i").toList, columnIndices.map(_ => NumericalFeature), columnIndices) + /** Construct feature index from feature types. Feature names are generated automatically. + * @param types list of feature types + * + * @example Construct a feature index with one numerical and two categorical features. + * {{{ + * val featureIndex = FeatureIndex(List(CategoricalFeature, NumericalFeature, CategoricalFeature)) + * }}} + */ def apply(types: List[FeatureType]): FeatureIndex = apply(types.indices.map(i => s"f$i").toList, types, types.indices.toList) def apply(types: List[FeatureType], columnIndices: List[Int]): FeatureIndex = apply(types.indices.map(i => s"f$i").toList, types, columnIndices) + /** Construct a feature index with custom feature names, types and column indices. + * @param names feature names + * @param types feature types + * @param columnIndices column index for each feature + * + * @example Construct a feature index with three features, named "age" (numerical), "height" (numerical) + * and "group" (categorical). 
+ * {{{ + * val featureIndex = FeatureIndex(List("age", "height", "group"), List(NumericalFeature, + * NumericalFeature, CategoricalFeature), List(0, 1, 2)) + * }}} + */ def apply(names: List[String], types: List[FeatureType], columnIndices: List[Int]): FeatureIndex = new FeatureIndex(names.toIndexedSeq, types.toIndexedSeq, columnIndices.toIndexedSeq) } From 491f5df123c9ccf127a5e68118c0b014dac580e6 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Sat, 13 Jul 2019 13:04:36 +0200 Subject: [PATCH 04/15] Update package documentation (basic dataset info) --- .../io/picnicml/doddlemodel/data/package.scala | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/main/scala/io/picnicml/doddlemodel/data/package.scala b/src/main/scala/io/picnicml/doddlemodel/data/package.scala index 4bcc8911..f2ea2ced 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/package.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/package.scala @@ -4,6 +4,7 @@ import breeze.linalg.{DenseMatrix, DenseVector, unique} import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering import io.picnicml.doddlemodel.data.Feature.FeatureIndex +/** Provides data management utilities and definitions of custom doddle-model data types. */ package object data { type RealVector = DenseVector[Float] @@ -17,9 +18,22 @@ package object data { type Dataset = (Features, Target) type DatasetWithIndex = (Features, Target, FeatureIndex) + /** Loads and returns the Boston Housing prices dataset. */ def loadBostonDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBostonDataset + /** Loads and returns the Breast cancer Wisconsin (diagnostic) dataset. + * + * @see + * Breast cancer dataset on UCI Machine Learning Repository + * */ def loadBreastCancerDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBreastCancerDataset + /** Loads and returns the Iris dataset. 
+ * + * @see + * Iris dataset on UCI Machine Learning Repository + * */ def loadIrisDataset: DatasetWithIndex = ResourceDatasetLoaders.loadIrisDataset + + /** Loads and returns an artificial dataset with a Poisson target variable. */ def loadHighSchoolTestDataset: DatasetWithIndex = ResourceDatasetLoaders.loadHighSchoolTestDataset def numberOfUniqueGroups(groups: IntVector): Int = { From d8aebb46e1258d99988328b8c0b3e3d685a61803 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 16 Jul 2019 01:45:35 +0200 Subject: [PATCH 05/15] Format code examples in dummy package --- .../classification/MostFrequentClassifier.scala | 10 ++++++---- .../dummy/classification/StratifiedClassifier.scala | 12 +++++++----- .../dummy/classification/UniformClassifier.scala | 10 ++++++---- .../doddlemodel/dummy/regression/MeanRegressor.scala | 6 ++++-- .../dummy/regression/MedianRegressor.scala | 6 ++++-- 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala index 7ba48156..c00cb079 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala @@ -6,13 +6,15 @@ import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering import io.picnicml.doddlemodel.data.{Features, Simplex, Target} import io.picnicml.doddlemodel.typeclasses.Classifier +case class MostFrequentClassifier private (numClasses: Option[Int], mostFrequentClass: Option[Float]) + /** An immutable dummy classifier that always predicts the most frequent label. 
* - * Examples: - * val model = MostFrequentClassifier() + * @example + * {{{ + * val model = MostFrequentClassifier() + * }}} */ -case class MostFrequentClassifier private (numClasses: Option[Int], mostFrequentClass: Option[Float]) - object MostFrequentClassifier { def apply(): MostFrequentClassifier = MostFrequentClassifier(none, none) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala index 3ebd0bd9..0f79bfb6 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala @@ -9,11 +9,6 @@ import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier.ev import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Classifier -/** An immutable dummy classifier that samples predictions from a stratified categorical distribution. - * - * Examples: - * val model = StratifiedClassifier() - */ case class StratifiedClassifier private (numClasses: Option[Int], targetDistr: Option[Multinomial[DenseVector[Double], Int]]) { @@ -23,6 +18,13 @@ case class StratifiedClassifier private (numClasses: Option[Int], } } +/** An immutable dummy classifier that samples predictions from a stratified categorical distribution. 
+ * + * @example + * {{{ + * val model = StratifiedClassifier() + * }}} + */ object StratifiedClassifier { def apply(): StratifiedClassifier = StratifiedClassifier(none, none) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala index 169c7cf0..379549a9 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala @@ -6,13 +6,15 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, Simplex, Target} import io.picnicml.doddlemodel.typeclasses.Classifier +case class UniformClassifier private (numClasses: Option[Int]) + /** An immutable dummy classifier that samples predictions from a uniform categorical distribution. * - * Examples: - * val model = UniformClassifier() + * @example + * {{{ + * val model = UniformClassifier() + * }}} */ -case class UniformClassifier private (numClasses: Option[Int]) - object UniformClassifier { def apply(): UniformClassifier = UniformClassifier(none) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala index f4ac5df7..83785559 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala @@ -8,8 +8,10 @@ import io.picnicml.doddlemodel.typeclasses.Regressor /** An immutable dummy regressor that always predicts the sample mean. 
* - * Examples: - * val model = MeanRegressor() + * @example + * {{{ + * val model = MeanRegressor() + * }}} */ case class MeanRegressor private (mean: Option[Float]) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala index bf063e53..ea8356e7 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala @@ -8,8 +8,10 @@ import io.picnicml.doddlemodel.typeclasses.Regressor /** An immutable dummy regressor that always predicts the sample median. * - * Examples: - * val model = MedianRegressor() + * @example + * {{{ + * val model = MedianRegressor() + * }}} */ case class MedianRegressor private (median: Option[Float]) From 68112c42d09a806a6f31b5b9b078ba60514a441a Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 16 Jul 2019 01:52:00 +0200 Subject: [PATCH 06/15] Add docs for impute package --- .../doddlemodel/impute/MeanValueImputer.scala | 29 ++++++++++++------ .../impute/MostFrequentValueImputer.scala | 30 ++++++++++++------- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala b/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala index b3c61f92..3b414310 100644 --- a/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala +++ b/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala @@ -8,20 +8,31 @@ import io.picnicml.doddlemodel.data.{Features, RealVector} import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Transformer -/** An immutable simple imputer that replaces all NaN values with column means. 
- * - * @param featureIndex feature index associated with features, this is needed so that only numerical features - * are transformed by this preprocessor, could be a subset of columns to be transformed - * - * Examples: - * val imputer = MeanValueImputer(featureIndex) - * val imputerSubsetOfColumns = MeanValueImputer(featureIndex.subset("f0", "f2")) - */ case class MeanValueImputer private (private[impute] val means: Option[RealVector], private val featureIndex: FeatureIndex) +/** An immutable simple imputer that replaces all NaN values with column means. */ object MeanValueImputer { + /** Create an imputer based on a feature index. + * + * @param featureIndex feature index associated with features, this is needed so that only numerical features + * are transformed by this preprocessor, could be a subset of columns to be transformed + * + * @example Impute values for all (numerical) features. + * {{{ + * val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature, NumericalFeature, + * NumericalFeature)) + * val imputer = MeanValueImputer(featureIndex) + * }}} + * + * @example Impute values for a subset of features. 
+ * {{{ + * val featureIndex = FeatureIndex(List("f0", "f1", "f2"), List(NumericalFeature, NumericalFeature, + * NumericalFeature), List(0, 1, 2)) + * val imputerSubsetOfColumns = MeanValueImputer(featureIndex.subset("f0", "f2")) + * }}} + */ def apply(featureIndex: FeatureIndex): MeanValueImputer = MeanValueImputer(none, featureIndex) @SerialVersionUID(0L) diff --git a/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala b/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala index ec9ab926..2d912cb2 100644 --- a/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala +++ b/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala @@ -7,20 +7,30 @@ import io.picnicml.doddlemodel.data.{Features, RealVector} import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Transformer -/** An immutable simple imputer that replaces all NaN values with most frequent value of a corresponding column. - * - * @param featureIndex feature index associated with features, this is needed so that only categorical features - * are transformed by this preprocessor, could be a subset of columns to be transformed - * - * Examples: - * val imputer = MostFrequentValueImputer(featureIndex) - * val imputerSubsetOfColumns = MostFrequentValueImputer(featureIndex.subset("f0", "f2")) - */ case class MostFrequentValueImputer private (private[impute] val mostFrequent: Option[RealVector], private val featureIndex: FeatureIndex) - +/** An immutable simple imputer that replaces all NaN values with most frequent value of a corresponding column. */ object MostFrequentValueImputer { + /** Create an imputer based on a feature index. 
+ * + * @param featureIndex feature index associated with features, this is needed so that only categorical features + * are transformed by this preprocessor, could be a subset of columns to be transformed + * + * @example Impute values for all (categorical) features. + * {{{ + * val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature, NumericalFeature, + * NumericalFeature)) + * val imputer = MostFrequentValueImputer(featureIndex) + * }}} + * + * @example Impute values for a subset of features. + * {{{ + * val featureIndex = FeatureIndex(List("f0", "f1", "f2"), List(NumericalFeature, NumericalFeature, + * NumericalFeature), List(0, 1, 2)) + * val imputerSubsetOfColumns = MostFrequentValueImputer(featureIndex.subset("f0", "f2")) + * }}} + */ def apply(featureIndex: FeatureIndex): MostFrequentValueImputer = MostFrequentValueImputer(None, featureIndex) From fcfd2955e0833d5df3a195452a05e98ad2dd36b9 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 16 Jul 2019 23:35:34 +0200 Subject: [PATCH 07/15] Add docs for linear package --- .../doddlemodel/linear/LinearRegression.scala | 23 ++++++++++++------- .../linear/LogisticRegression.scala | 23 ++++++++++++------- .../linear/PoissonRegression.scala | 23 ++++++++++++------- .../linear/SoftmaxClassifier.scala | 23 ++++++++++++------- 4 files changed, 60 insertions(+), 32 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala index 3d6e4ae8..34ede7cf 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala @@ -4,20 +4,27 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearRegressor -/** An immutable multiple linear regression model with ridge regularization. 
- * - * @param lambda L2 regularization strength, must be positive, 0 means no regularization - * - * Examples: - * val model = LinearRegression() - * val model = LinearRegression(lambda = 1.5f) - */ +/** An immutable multiple linear regression model with ridge regularization. */ case class LinearRegression private (lambda: Float, private val w: Option[RealVector]) { private var yPredCache: Target = _ } object LinearRegression { + /** Create a regularized linear regression model. + * + * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * + * @example Create and fit a regularized linear regression model with lambda = 1.5. + * {{{ + * import io.picnicml.doddlemodel.linear.LinearRegression.ev + * + * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) + * val y: Target = DenseVector(-3.0, 2.0) + * val model = LinearRegression(lambda = 1.5f) + * val fittedModel = ev.fit(model, X, y) + * }}} + */ def apply(lambda: Float = 0.0f): LinearRegression = { require(lambda >= 0.0f, "L2 regularization strength must be non-negative") LinearRegression(lambda, none) diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala index a62d2da6..3a30860b 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala @@ -6,20 +6,27 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, RealVector, Simplex, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearClassifier -/** An immutable multiple logistic regression model with ridge regularization. 
- * - * @param lambda L2 regularization strength, must be positive, 0 means no regularization - * - * Examples: - * val model = LogisticRegression() - * val model = LogisticRegression(lambda = 1.5f) - */ +/** An immutable multiple logistic regression model with ridge regularization. */ case class LogisticRegression private (lambda: Float, numClasses: Option[Int], private val w: Option[RealVector]) { private var yPredProbaCache: RealVector = _ } object LogisticRegression { + /** Create a regularized logistic regression model. + * + * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * + * @example Create and fit a logistic regression model with lambda = 1.5. + * {{{ + * import io.picnicml.doddlemodel.linear.LogisticRegression.ev + * + * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) + * val y: Target = DenseVector(0.0, 1.0) + * val model = LogisticRegression(lambda = 1.5f) + * val fittedModel = ev.fit(model, X, y) + * }}} + */ def apply(lambda: Float = 0.0f): LogisticRegression = { require(lambda >= 0.0f, "L2 regularization strength must be non-negative") LogisticRegression(lambda, none, none) diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala index fc0b80e6..9e8b2237 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala @@ -6,20 +6,27 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearRegressor -/** An immutable multiple Poisson regression model with ridge regularization. 
- * - * @param lambda L2 regularization strength, must be positive, 0 means no regularization - * - * Examples: - * val model = PoissonRegression() - * val model = PoissonRegression(lambda = 1.5f) - */ +/** An immutable multiple Poisson regression model with ridge regularization. */ case class PoissonRegression private (lambda: Float, private val w: Option[RealVector]) { private var yPredMeanCache: Target = _ } object PoissonRegression { + /** Create a regularized Poisson regression model. + * + * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * + * @example Create and fit a regularized Poisson regression model with lambda = 1.5. + * {{{ + * import io.picnicml.doddlemodel.linear.PoissonRegression.ev + * + * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) + * val y: Target = DenseVector(3.0, 2.0) + * val model = PoissonRegression(lambda = 1.5f) + * val fittedModel = ev.fit(model, X, y) + * }}} + */ def apply(lambda: Float = 0.0f): PoissonRegression = { require(lambda >= 0.0f, "L2 regularization strength must be non-negative") PoissonRegression(lambda, none) diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala index 30001ea3..e1e0bd43 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala @@ -7,20 +7,27 @@ import io.picnicml.doddlemodel.data.{Features, RealVector, Simplex, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearClassifier import io.picnicml.doddlemodel.syntax.OptionSyntax._ -/** An immutable multiple multinomial regression model with ridge regularization. 
- * - * @param lambda L2 regularization strength, must be positive, 0 means no regularization - * - * Examples: - * val model = SoftmaxClassifier() - * val model = SoftmaxClassifier(lambda = 1.5f) - */ +/** An immutable multiple multinomial regression model with ridge regularization. */ case class SoftmaxClassifier private (lambda: Float, numClasses: Option[Int], private val w: Option[RealVector]) { private var yPredProbaCache: Simplex = _ } object SoftmaxClassifier { + /** Create a regularized softmax model. + * + * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * + * @example Create and fit a regularized softmax classifier with lambda = 1.5. + * {{{ + * import io.picnicml.doddlemodel.linear.SoftmaxClassifier.ev + * + * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) + * val y: Target = DenseVector(0.0, 1.0) + * val model = SoftmaxClassifier(lambda = 1.5f) + * val fittedModel = ev.fit(model, X, y) + * }}} + */ def apply(lambda: Float = 0.0f): SoftmaxClassifier = { require(lambda >= 0.0f, "L2 regularization strength must be non-negative") SoftmaxClassifier(lambda, none, none) From bebf025a6cb11530fc7280735255993e7c3ef90f Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Wed, 17 Jul 2019 23:11:03 +0200 Subject: [PATCH 08/15] Update docs for metrics package --- .../metrics/ClassificationMetrics.scala | 24 +++++++++++++++---- .../doddlemodel/metrics/RankingMetrics.scala | 17 ++++++++++++- .../metrics/RegressionMetrics.scala | 20 ++++++++++++---- .../doddlemodel/metrics/package.scala | 18 ++++++++++++++ 4 files changed, 69 insertions(+), 10 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/metrics/ClassificationMetrics.scala b/src/main/scala/io/picnicml/doddlemodel/metrics/ClassificationMetrics.scala index 6091a8ae..57d88388 100644 --- a/src/main/scala/io/picnicml/doddlemodel/metrics/ClassificationMetrics.scala +++ b/src/main/scala/io/picnicml/doddlemodel/metrics/ClassificationMetrics.scala @@ 
-4,7 +4,7 @@ import io.picnicml.doddlemodel.data.{Target, numberOfTargetClasses} object ClassificationMetrics { - /** Classification accuracy. */ + /** Classification accuracy - measures the proportion of correctly classified examples among all examples. */ object Accuracy extends Metric { override lazy val higherValueIsBetter: Boolean = true @@ -15,7 +15,12 @@ object ClassificationMetrics { override def toString: String = "accuracy" } - /** Positive predictive value. */ + /** Precision (positive predictive value) - measures the proportion of correctly classified positive examples + * (true positives) among all examples classified as positive. + * + * @note Only defined for a binary classification task. + * @see [[https://en.wikipedia.org/wiki/Precision_and_recall]] + * */ object Precision extends Metric { override lazy val higherValueIsBetter: Boolean = true @@ -36,7 +41,12 @@ object ClassificationMetrics { override def toString: String = "precision" } - /** Sensitivity. */ + /** Recall (sensitivity) - measures the proportion of correctly classified positive examples (true positives) + * among all actual positive examples. + * + * @note Only defined for a binary classification task. + * @see [[https://en.wikipedia.org/wiki/Precision_and_recall]] + * */ object Recall extends Metric { override lazy val higherValueIsBetter: Boolean = true @@ -57,7 +67,11 @@ object ClassificationMetrics { override def toString: String = "recall" } - /** F1 score. */ + /** F1 score - defined as the harmonic average of precision and recall. + * + * @note Only defined for a binary classification task. + * @see [[https://en.wikipedia.org/wiki/F1_score]] + * */ object F1Score extends Metric { override lazy val higherValueIsBetter: Boolean = true @@ -77,7 +91,7 @@ object ClassificationMetrics { override def toString: String = "F1 score" } - /** Hamming loss. */ + /** Hamming loss - measures the proportion of incorrectly classified examples. 
*/ object HammingLoss extends Metric { override lazy val higherValueIsBetter: Boolean = false diff --git a/src/main/scala/io/picnicml/doddlemodel/metrics/RankingMetrics.scala b/src/main/scala/io/picnicml/doddlemodel/metrics/RankingMetrics.scala index 690c24d9..f768843d 100644 --- a/src/main/scala/io/picnicml/doddlemodel/metrics/RankingMetrics.scala +++ b/src/main/scala/io/picnicml/doddlemodel/metrics/RankingMetrics.scala @@ -7,7 +7,14 @@ import scala.collection.compat.immutable.ArraySeq object RankingMetrics { - /** Area under the ROC-curve. **/ + /** Area under the ROC-curve. + * + * Can be interpreted as the probability that a classifier will rank a randomly chosen positive example higher + * than a randomly chosen negative example. + * + * @note Only defined for a binary classification task. + * @see [[https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve]] + * */ object Auc extends Metric { override lazy val higherValueIsBetter: Boolean = true @@ -27,7 +34,15 @@ object RankingMetrics { /** Receiver operating characteristic curve (ROC-curve). * + * Presents the ability of a binary classifier (in terms of true positive rate and false positive rate) as the + * discrimination threshold is varied. + * + * @param y ground truth labels + * @param yPredProba predicted probabilities * @param length the number of thresholds to take into account, i.e. the number of coordinates returned + * @note Only defined for a binary classification task. + * @note Currently, ROC-curve is only defined for probability scores (i.e. 
`yPredProba` needs to contain values + * between 0.0 and 1.0) */ def rocCurve(y: Target, yPredProba: RealVector, length: Int = 30): RocCurve = { require(length >= 5, "Number of points of the ROC-curve must be at least 3") diff --git a/src/main/scala/io/picnicml/doddlemodel/metrics/RegressionMetrics.scala b/src/main/scala/io/picnicml/doddlemodel/metrics/RegressionMetrics.scala index c02b4643..d37be2b7 100644 --- a/src/main/scala/io/picnicml/doddlemodel/metrics/RegressionMetrics.scala +++ b/src/main/scala/io/picnicml/doddlemodel/metrics/RegressionMetrics.scala @@ -7,7 +7,10 @@ import io.picnicml.doddlemodel.data.Target object RegressionMetrics { - /** Root mean squared error. */ + /** Root mean squared error - defined as the square root of mean squared error. + * + * @see [[https://en.wikipedia.org/wiki/Root-mean-square_deviation]] + * */ object Rmse extends Metric { override lazy val higherValueIsBetter: Boolean = false @@ -17,7 +20,10 @@ object RegressionMetrics { override def toString: String = "RMSE" } - /** Mean squared error. */ + /** Mean squared error - defined as the average of the squares of error. + * + * @see [[https://en.wikipedia.org/wiki/Mean_squared_error]] + * */ object Mse extends Metric { override lazy val higherValueIsBetter: Boolean = false @@ -30,7 +36,10 @@ object RegressionMetrics { override def toString: String = "MSE" } - /** Mean absolute error. */ + /** Mean absolute error - defined as the average of absolute error. + * + * @see [[https://en.wikipedia.org/wiki/Mean_absolute_error]] + * */ object Mae extends Metric { override lazy val higherValueIsBetter: Boolean = false @@ -41,7 +50,10 @@ object RegressionMetrics { } - /** Explained variance. */ + /** Explained variance - measures the proportion of variance in dataset that is captured by the model. 
+ * + * @see [[https://en.wikipedia.org/wiki/Explained_variation]] + * */ object ExplainedVariance extends Metric { override lazy val higherValueIsBetter: Boolean = true diff --git a/src/main/scala/io/picnicml/doddlemodel/metrics/package.scala b/src/main/scala/io/picnicml/doddlemodel/metrics/package.scala index c4a926bb..fb4cdf55 100644 --- a/src/main/scala/io/picnicml/doddlemodel/metrics/package.scala +++ b/src/main/scala/io/picnicml/doddlemodel/metrics/package.scala @@ -4,21 +4,39 @@ import io.picnicml.doddlemodel.metrics.ClassificationMetrics._ import io.picnicml.doddlemodel.metrics.RankingMetrics._ import io.picnicml.doddlemodel.metrics.RegressionMetrics._ +/** Provides various evaluation metrics for prediction tasks. */ package object metrics { // regression metrics + /** @see [[io.picnicml.doddlemodel.metrics.RegressionMetrics.Mse]] */ lazy val mse: Metric = Mse + + /** @see [[io.picnicml.doddlemodel.metrics.RegressionMetrics.Rmse]] */ lazy val rmse: Metric = Rmse + + /** @see [[io.picnicml.doddlemodel.metrics.RegressionMetrics.Mae]] */ lazy val mae: Metric = Mae + + /** @see [[io.picnicml.doddlemodel.metrics.RegressionMetrics.ExplainedVariance]] */ lazy val explainedVariance: Metric = ExplainedVariance // classification metrics + /** @see [[io.picnicml.doddlemodel.metrics.ClassificationMetrics.Accuracy]] */ lazy val accuracy: Metric = Accuracy + + /** @see [[io.picnicml.doddlemodel.metrics.ClassificationMetrics.Precision]] */ lazy val precision: Metric = Precision + + /** @see [[io.picnicml.doddlemodel.metrics.ClassificationMetrics.Recall]] */ lazy val recall: Metric = Recall + + /** @see [[io.picnicml.doddlemodel.metrics.ClassificationMetrics.F1Score]] */ lazy val f1Score: Metric = F1Score + + /** @see [[io.picnicml.doddlemodel.metrics.ClassificationMetrics.HammingLoss]] */ lazy val hammingLoss: Metric = HammingLoss // ranking metrics + /** @see [[io.picnicml.doddlemodel.metrics.RankingMetrics.Auc]] */ lazy val auc: Metric = Auc } From 
47503a29477dd7736d47b8548a54ef7f4fa2f6f0 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Wed, 11 Sep 2019 12:09:12 +0200 Subject: [PATCH 09/15] Update docs for model selection package --- .../modelselection/CrossValidation.scala | 35 +++++++++++++------ .../modelselection/GroupKFoldSplitter.scala | 28 +++++++++------ .../modelselection/HyperparameterSearch.scala | 25 +++++++++++++ .../modelselection/KFoldSplitter.scala | 23 +++++++----- 4 files changed, 80 insertions(+), 31 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala index e0d2a29e..c3798426 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala @@ -9,21 +9,12 @@ import scala.concurrent.duration.Duration import scala.concurrent.{Await, Future} import scala.util.Random -/** A parallel, n-fold cross validation technique. - * - * @param metric a function from io.picnicml.doddlemodel.metrics used to calculate each fold's score - * @param dataSplitter a strategy for splitting the dataset into multiple folds - * - * Examples: - * val splitter = KFoldSplitter(folds = 3) - * val cv = CrossValidation(metric = rmse, dataSplitter = splitter)) - * cv.score(model, x, y) - */ class CrossValidation private (val metric: Metric, val dataSplitter: DataSplitter) { private implicit val ec: CVExecutionContext = new CVExecutionContext() - /** + /** Obtain the average score of all folds. 
+ * * @param reusable indicates whether to shutdown the thread pool after the cv score is computed * and by default it is, if the same CrossValidation instance is needed after the first call * to score(...), bring implicit CrossValReusable(true) to scope and call CrossValidation.shutdownNow() @@ -57,8 +48,30 @@ class CrossValidation private (val metric: Metric, val dataSplitter: DataSplitte def shutdownNow(): Unit = this.ec.shutdownNow() } +/** A parallel, k-fold cross validation technique. */ object CrossValidation { + /** Create a k-fold cross validation instance. + * @param metric a function from [[io.picnicml.doddlemodel.metrics]] used to calculate each fold's score + * @param dataSplitter a strategy for splitting the dataset into multiple folds + * + * @example Perform 2-fold cross validation using logistic regression and evaluate its performance + * using root mean squared error. + * {{{ + * import io.picnicml.doddlemodel.metrics.rmse + * import io.picnicml.doddlemodel.linear.LogisticRegression + * + * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0), List(5.0, 6.0), List(7.0, 8.0)) + * val y: Target = DenseVector(0.0, 1.0, 0.0, 1.0) + * val model = LogisticRegression(1.0) + * + * val splitter = KFoldSplitter(numFolds = 2) + * val cv = CrossValidation(metric = rmse, dataSplitter = splitter) + * cv.score(model, X, y) + * }}} + * + * @see [[io.picnicml.doddlemodel.metrics Metrics in doddle-model]] + */ def apply(metric: Metric, dataSplitter: DataSplitter): CrossValidation = new CrossValidation(metric, dataSplitter) } diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala index 4405aff4..0af085fb 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala @@ -8,16 +8,6 @@ import 
io.picnicml.doddlemodel.modelselection.GroupKFoldSplitter.{TestFolds, Tra import scala.util.Random -/** K-Folds strategy for splitting data that makes sure groups in each fold are non-overlapping, - * i.e no group is present in both training and testing splits. Folds try to be as balanced - * as possible, i.e. the number of test examples in each fold is approximately the same. - * - * @param numFolds number of folds - * - * Examples: - * val dataSplitter = GroupKFoldSplitter(folds = 3) - * datasplitter.splitData(x, y, groups) - */ class GroupKFoldSplitter private (val numFolds: Int) extends DataSplitter { override def splitData(x: Features, y: Target, groups: IntVector) @@ -61,9 +51,25 @@ class GroupKFoldSplitter private (val numFolds: Int) extends DataSplitter { throw new NotImplementedError("GroupKFoldSplitter only splits data based on groups") } - +/** A strategy for splitting data into k folds that makes sure groups in each fold are non-overlapping, + * i.e. no group is present in both training and testing splits. */ object GroupKFoldSplitter { + /** Create a group k-fold splitter. + * @param numFolds number of folds + * + * @example Split 10 examples, corresponding to data of 3 patients, into 3 folds, making sure that data of a patient + * never appears in both training and test set in the same fold. 
+ * {{{ + * val patientFeatures = DenseMatrix.rand(10, 3) + * val isSick = DenseVector(0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0) + * val idPatient = DenseVector(1, 2, 2, 0, 0, 0, 2, 1, 1, 2) + * + * val splitter = GroupKFoldSplitter(numFolds = 3) + * // stream, containing 3 TrainTestSplits + * val splits = splitter.splitData(patientFeatures, isSick, idPatient) + * }}} + */ def apply(numFolds: Int): GroupKFoldSplitter = { require(numFolds > 0, "Number of folds must be positive") new GroupKFoldSplitter(numFolds) diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala index 9d28195c..49688bdb 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala @@ -52,8 +52,33 @@ class HyperparameterSearch private (val numIterations: Int, val crossVal: CrossV } } +/** A parallel hyperparameter search using k-fold cross validation. */ object HyperparameterSearch { + /** Create a hyperparameter search instance. + * @param numIterations number of predictors for which the cross validation score is calculated + * @param crossValidation k-fold cross validation instance + * @param verbose flag that specifies whether validation score of the selected model is printed to standard output + * + * @example Search among 3 different regularization values (0.1, 0.2, 0.5) for logistic regression using + * 3-fold cross validation and store the (re-fitted on entire dataset) model that obtains highest accuracy. 
+ * {{{ + * import io.picnicml.doddlemodel.metrics.accuracy + * import io.picnicml.doddlemodel.linear.LogisticRegression + * + * val x = DenseMatrix.rand(10, 3) + * val y = DenseVector(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0) + * + * val splitter = KFoldSplitter(numFolds = 3) + * val cv = CrossValidation(metric = accuracy, dataSplitter = splitter) + * val search = HyperparameterSearch(numIterations = 3, crossValidation = cv) + * val lambdas = List(0.1, 0.2, 0.5).iterator + * + * val modelBestParams = search.bestOf(x, y) { + * LogisticRegression(lambda = lambdas.next) + * } + * }}} + */ def apply(numIterations: Int, crossValidation: CrossValidation, verbose: Boolean = true): HyperparameterSearch = { require(numIterations > 0, "Number of iterations must be positive") new HyperparameterSearch(numIterations, crossValidation, verbose) diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala index 2f462b7b..c2877269 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala @@ -4,15 +4,6 @@ import io.picnicml.doddlemodel.data.{Features, IntVector, Target, TrainTestSplit import scala.util.Random -/** K-Folds strategy for splitting data. - * - * @param numFolds number of folds - * @param shuffleRows indicates whether examples should be shuffled prior to calculating the score - * - * Examples: - * val dataSplitter = KFoldSplitter(folds = 3) - * datasplitter.splitData(x, y) - */ class KFoldSplitter private (val numFolds: Int, val shuffleRows: Boolean) extends DataSplitter { override def splitData(x: Features, y: Target) @@ -60,8 +51,22 @@ class KFoldSplitter private (val numFolds: Int, val shuffleRows: Boolean) extend throw new NotImplementedError("KFoldSplitter doesn't split data based on groups") } +/** K-folds strategy for splitting data. 
*/ object KFoldSplitter { + /** Create a k-fold splitter instance. + * @param numFolds number of folds + * @param shuffleRows a flag indicating whether examples should be shuffled prior to calculating the splits + * + * @example Split data into 3 folds. + * {{{ + * val x = DenseMatrix.rand(7, 2) + * val y = DenseVector(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0) + * + * val splitter = KFoldSplitter(numFolds = 3) + * splitter.splitData(x, y) + * }}} + */ def apply(numFolds: Int, shuffleRows: Boolean = true): KFoldSplitter = { require(numFolds > 0, "Number of folds must be positive") new KFoldSplitter(numFolds, shuffleRows) From 64e5862d5a62c7adc6220f2a16b4aa7113690ab4 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Sun, 13 Oct 2019 23:44:53 +0200 Subject: [PATCH 10/15] Verify and improve docs for `data` package * make sure examples in docs are working after rebase * make examples complete by including imports --- .../picnicml/doddlemodel/data/CsvLoader.scala | 16 +++--- .../doddlemodel/data/DatasetUtils.scala | 40 +++++++++++--- .../picnicml/doddlemodel/data/Feature.scala | 52 +++++++++++++++---- .../picnicml/doddlemodel/data/package.scala | 6 ++- 4 files changed, 86 insertions(+), 28 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala b/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala index 24f1d840..8437173a 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala @@ -9,29 +9,27 @@ import scala.io.{BufferedSource, Source} object CsvLoader { - /** Loads a csv dataset with 2 header lines (1st line for feature names and 2nd for types). + /** Loads a csv dataset with 2 header lines (first line for feature names and second for types). * @param datasetFilePath csv file to load * @param na value to interpret as N/A data in the given dataset * * @example Reading the iris dataset. 
* {{{ - * import java.io.File + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset * - * val file = new File("/datasets/iris.csv") - * val (data, featureInfo) = loadCsvDataset(file) + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv") * // separate features from the label - * val (irisFeatures, irisLabels) = data(::, 0 to 3), data(::, -1) + * val (irisFeatures, irisLabels) = (data(::, 0 to 3), data(::, -1)) * }}} * * @example Reading a dataset where N/A values are marked with `NA`. * {{{ - * import java.io.File + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset * - * val file = new File("/datasets/dummy_csv_reading.csv") * // specify a value to interpret as N/A data - * val (data, featureInfo) = loadCsvDataset(file, "NA") + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA") * }}} - * */ + */ def loadCsvDataset(datasetFilePath: String, na: String = "NA"): FeaturesWithIndex = loadCsvDataset(Source.fromFile(datasetFilePath), na) diff --git a/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala b/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala index 44e86b0c..a79b5c96 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala @@ -13,17 +13,19 @@ object DatasetUtils { * @example Shuffle a dataset randomly. * {{{ * import scala.util.Random + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.data.DatasetUtils.shuffleDataset * - * // we are assuming data was previously loaded - * val (dataX, dataY) = ... 
+ * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv") + * val (dataX, dataY) = (data(::, 0 to 3), data(::, -1)) * * val (shuffledX, shuffledY) = shuffleDataset(dataX, dataY) * - * // seeded shuffle - seed passed to shuffler implicitly + * // seeded shuffle - seed is passed to shuffler implicitly * implicit val rand: Random = new Random(42) * val (shuffledX, shuffledY) = shuffleDataset(dataX, dataY) * }}} - * */ + */ def shuffleDataset(x: Features, y: Target)(implicit rand: Random = new Random()): Dataset = { val shuffleIndices = rand.shuffle((0 until y.length).toIndexedSeq) (x(shuffleIndices, ::).toDenseMatrix, y(shuffleIndices).toDenseVector) @@ -36,17 +38,39 @@ object DatasetUtils { * * @example Split dataset into training and test set. * {{{ - * // we are assuming data was previously loaded - * val (dataX, dataY) = ... + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.data.DatasetUtils.splitDataset + * + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv") + * val (dataX, dataY) = (data(::, 0 to 3), data(::, -1)) * * // by default, the split is 50%:50% * val trTeSplit = splitDataset(dataX, dataY) * * // put 80% of data into training set and 20% into test set - * val trTeSplit = splitDataset(dataX, dataY, 0.8) + * val trTeSplit = splitDataset(dataX, dataY, 0.8f) * val (trainX, trainY, testX, testY) = (trTeSplit.xTr, trTeSplit.yTr, trTeSplit.xTe, trTeSplit.yTe) * }}} - * */ + * + * @example Split dataset into training, validation and test set in the ratio 60%: 10%: 30%. This is done by + * performing two train-test splits in a row. First we split the dataset in ratio (60% + 10%): 30%, + * obtaining combined training and validation set and the test set. Then we split the first part using + * the ratio (60% / 70%): (10% / 70%) to obtain the training and validation set. 
+ * {{{ + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.data.DatasetUtils.splitDataset + * + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv") + * val (dataX, dataY) = (data(::, 0 to 3), data(::, -1)) + * + * val teVsNonTeSplit = splitDataset(dataX, dataY, 0.7f) + * val (trValX, trValY, testX, testY) = (teVsNonTeSplit.xTr, teVsNonTeSplit.yTr, + * teVsNonTeSplit.xTe, teVsNonTeSplit.yTe) + * + * val trValSplit = splitDataset(trValX, trValY, (0.6f / 0.7f)) + * val (trainX, trainY, valX, valY) = (trValSplit.xTr, trValSplit.yTr, trValSplit.xTe, trValSplit.yTe) + * }}} + */ def splitDataset(x: Features, y: Target, proportionTrain: Float = 0.5f): TrainTestSplit = { val numTrain = numberOfTrainExamplesBasedOnProportion(x.rows, proportionTrain) val trIndices = 0 until numTrain diff --git a/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala b/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala index 09b52bea..73d75da0 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala @@ -40,30 +40,48 @@ object Feature { * * @example Create feature index based on features "f1" and "f3" from a constructed feature index. * {{{ + * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature} + * * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature, * NumericalFeature), List(0, 1, 2)) * val subIndex = featureIndex.subset("f1", "f3") * }}} * - * */ + */ def subset(names: String*): FeatureIndex = { val nameToIndex = this.names.zipWithIndex.toMap subset(names.map(n => nameToIndex(n)):_*) } /** Create a feature index with subset of features, provided by feature indices. + * @param indices column indices for subset of features to be selected + * + * @example Create feature index based on second and third (i.e. 
indices 1, 2) features from a constructed + * feature index. + * {{{ + * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature} + * + * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature, + * NumericalFeature), List(0, 1, 2)) + * val subIndex = featureIndex.subset(1 to 2) + * }}} + */ + def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*) + + /** Create a feature index with subset of features, provided by feature indices. Alternative interface to do same + * as with `FeatureIndex.subset(indices: IndexedSeq[Int])`. * @param indices column indices for subset of features to be selected * * @example Create feature index based on second and third (i.e. indices 1, 2) features from a constructed * feature index. * {{{ + * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature} + * * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature, * NumericalFeature), List(0, 1, 2)) - * val subIndex = featureIndex.subset(1 to 2) + * val subIndex = featureIndex.subset(1, 2) * }}} - * */ - def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*) - + */ // DummyImplicit is needed to avoid the same type as String* after erasure def subset(indices: Int*)(implicit di: DummyImplicit): FeatureIndex = new FeatureIndex( indices.toIndexedSeq.map(i => this.names(i)), @@ -71,7 +89,17 @@ object Feature { indices.toIndexedSeq.map(i => this.columnIndices(i)) ) - /** Create a feature index by dropping a feature by column index. */ + /** Create a feature index by dropping a feature by column index. + * @param index index of column to be dropped + * @example Drop the third (index 2) feature from a feature index. 
+ * {{{ + * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature} + * + * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature, + * NumericalFeature), List(0, 1, 2)) + * val subIndex = featureIndex.drop(2) + * }}} + */ def drop(index: Int): FeatureIndex = new FeatureIndex( this.names.zipWithIndex.flatMap { case (n, i) => if (i != index) n.some else none[String] }, this.types.zipWithIndex.flatMap { case (t, i) => if (i != index) t.some else none[FeatureType] }, @@ -88,7 +116,9 @@ object Feature { * because some methods are only applicable to a certain type of features, e.g. [0, 1] scaling * only makes sense for numerical features. */ object FeatureIndex { - /** Construct feature index with `n` categorical features. Feature names are generated automatically. + + /** Construct feature index with `n` categorical features. Feature names are generated automatically - `i`th + * feature gets assigned the name "f`i`" (using 0-based counting). * @param n number of categorical features in feature index */ def categorical(n: Int): FeatureIndex = @@ -97,7 +127,8 @@ object Feature { def categorical(columnIndices: List[Int]): FeatureIndex = apply(columnIndices.indices.map(i => s"f$i").toList, columnIndices.map(_ => CategoricalFeature), columnIndices) - /** Construct feature index with `n` numerical features. Feature names are generated automatically. + /** Construct feature index with `n` numerical features. Feature names are generated automatically - `i`th + * feature gets assigned the name "f`i`" (using 0-based counting). * @param n number of numerical features in feature index */ def numerical(n: Int): FeatureIndex = @@ -106,11 +137,13 @@ object Feature { def numerical(columnIndices: List[Int]): FeatureIndex = apply(columnIndices.indices.map(i => s"f$i").toList, columnIndices.map(_ => NumericalFeature), columnIndices) - /** Construct feature index from feature types. Feature names are generated automatically. 
+ /** Construct feature index from feature types. Feature names are generated automatically - `i`th + * feature gets assigned the name "f`i`" (using 0-based counting). * @param types list of feature types * * @example Construct a feature index with one numerical and two categorical features. * {{{ + * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature, CategoricalFeature} * val featureIndex = FeatureIndex(List(CategoricalFeature, NumericalFeature, CategoricalFeature)) * }}} */ @@ -128,6 +161,7 @@ object Feature { * @example Construct a feature index with three features, named "age" (numerical), "height" (numerical) * and "group" (categorical). * {{{ + * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature, CategoricalFeature} * val featureIndex = FeatureIndex(List("age", "height", "group"), List(NumericalFeature, * NumericalFeature, CategoricalFeature), List(0, 1, 2)) * }}} diff --git a/src/main/scala/io/picnicml/doddlemodel/data/package.scala b/src/main/scala/io/picnicml/doddlemodel/data/package.scala index f2ea2ced..96954fa8 100644 --- a/src/main/scala/io/picnicml/doddlemodel/data/package.scala +++ b/src/main/scala/io/picnicml/doddlemodel/data/package.scala @@ -20,17 +20,19 @@ package object data { /** Loads and returns the Boston Housing prices dataset. */ def loadBostonDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBostonDataset + /** Loads and returns the Breast cancer Wisconsin (diagnostic) dataset. * * @see * Breast cancer dataset on UCI Machine Learning Repository - * */ + */ def loadBreastCancerDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBreastCancerDataset + /** Loads and returns the Iris dataset. * * @see * Iris dataset on UCI Machine Learning Repository - * */ + */ def loadIrisDataset: DatasetWithIndex = ResourceDatasetLoaders.loadIrisDataset /** Loads and returns an artificial dataset with a Poisson target variable. 
*/ From 1df2df1aa89c1ec7df382087beb1d18ce3370945 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Mon, 14 Oct 2019 17:18:21 +0200 Subject: [PATCH 11/15] Verify and improve docs for package --- .../classification/MostFrequentClassifier.scala | 16 +++++++++------- .../classification/StratifiedClassifier.scala | 16 +++++++++------- .../classification/UniformClassifier.scala | 16 +++++++++------- .../dummy/regression/MeanRegressor.scala | 17 ++++++++++------- .../dummy/regression/MedianRegressor.scala | 16 +++++++++------- 5 files changed, 46 insertions(+), 35 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala index c00cb079..55489b48 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala @@ -8,15 +8,17 @@ import io.picnicml.doddlemodel.typeclasses.Classifier case class MostFrequentClassifier private (numClasses: Option[Int], mostFrequentClass: Option[Float]) -/** An immutable dummy classifier that always predicts the most frequent label. - * - * @example - * {{{ - * val model = MostFrequentClassifier() - * }}} - */ +/** An immutable dummy classifier that always predicts the most frequent label. */ object MostFrequentClassifier { + /** Create a majority classifier. 
+ * + * @example + * {{{ + * import io.picnicml.doddlemodel.dummy.classification.MostFrequentClassifier + * val model = MostFrequentClassifier() + * }}} + */ def apply(): MostFrequentClassifier = MostFrequentClassifier(none, none) @SerialVersionUID(0L) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala index 0f79bfb6..31e05cee 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala @@ -18,15 +18,17 @@ case class StratifiedClassifier private (numClasses: Option[Int], } } -/** An immutable dummy classifier that samples predictions from a stratified categorical distribution. - * - * @example - * {{{ - * val model = StratifiedClassifier() - * }}} - */ +/** An immutable dummy classifier that samples predictions from a stratified categorical distribution. */ object StratifiedClassifier { + /** Create a stratified classifier. + * + * @example + * {{{ + * import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier + * val model = StratifiedClassifier() + * }}} + */ def apply(): StratifiedClassifier = StratifiedClassifier(none, none) @SerialVersionUID(0L) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala index 379549a9..8e82d517 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala @@ -8,15 +8,17 @@ import io.picnicml.doddlemodel.typeclasses.Classifier case class UniformClassifier private (numClasses: Option[Int]) -/** An immutable dummy classifier that samples predictions from a uniform categorical distribution. 
- * - * @example - * {{{ - * val model = UniformClassifier() - * }}} - */ +/** An immutable dummy classifier that samples predictions from a uniform categorical distribution. */ object UniformClassifier { + /** Create a uniform classifier. + * + * @example + * {{{ + * import io.picnicml.doddlemodel.dummy.classification.UniformClassifier + * val model = UniformClassifier() + * }}} + */ def apply(): UniformClassifier = UniformClassifier(none) @SerialVersionUID(0L) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala index 83785559..581c02eb 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala @@ -6,17 +6,20 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, Target} import io.picnicml.doddlemodel.typeclasses.Regressor -/** An immutable dummy regressor that always predicts the sample mean. - * - * @example - * {{{ - * val model = MeanRegressor() - * }}} - */ + case class MeanRegressor private (mean: Option[Float]) +/** An immutable dummy regressor that always predicts the sample mean. */ object MeanRegressor { + /** Create a mean regressor. 
+ * + * @example + * {{{ + * import io.picnicml.doddlemodel.dummy.regression.MeanRegressor + * val model = MeanRegressor() + * }}} + */ def apply(): MeanRegressor = MeanRegressor(none) @SerialVersionUID(0L) diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala index ea8356e7..04145d71 100644 --- a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala +++ b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala @@ -6,17 +6,19 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, Target} import io.picnicml.doddlemodel.typeclasses.Regressor -/** An immutable dummy regressor that always predicts the sample median. - * - * @example - * {{{ - * val model = MedianRegressor() - * }}} - */ case class MedianRegressor private (median: Option[Float]) +/** An immutable dummy regressor that always predicts the sample median. */ object MedianRegressor { + /** Create a median regressor. 
+ * + * @example + * {{{ + * import io.picnicml.doddlemodel.dummy.regression.MedianRegressor + * val model = MedianRegressor() + * }}} + */ def apply(): MedianRegressor = MedianRegressor(none) @SerialVersionUID(0L) From c4a8b57b9d398ceec592d7e401f7f557d2ca040d Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 15 Oct 2019 10:09:05 +0200 Subject: [PATCH 12/15] Verify and improve docs for `impute` --- .../doddlemodel/impute/MeanValueImputer.scala | 28 ++++++++++++----- .../impute/MostFrequentValueImputer.scala | 31 +++++++++++++------ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala b/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala index 3b414310..2fcc5278 100644 --- a/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala +++ b/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala @@ -11,26 +11,38 @@ import io.picnicml.doddlemodel.typeclasses.Transformer case class MeanValueImputer private (private[impute] val means: Option[RealVector], private val featureIndex: FeatureIndex) -/** An immutable simple imputer that replaces all NaN values with column means. */ +/** An immutable simple imputer that replaces numerical NaN values with column means. Categorical values are left + * untouched. */ object MeanValueImputer { /** Create an imputer based on a feature index. * - * @param featureIndex feature index associated with features, this is needed so that only numerical features + * @param featureIndex feature index associated with features - this is needed so that only numerical features * are transformed by this preprocessor, could be a subset of columns to be transformed * * @example Impute values for all (numerical) features. 
* {{{ - * val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature, NumericalFeature, - * NumericalFeature)) - * val imputer = MeanValueImputer(featureIndex) + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.impute.MeanValueImputer + * import io.picnicml.doddlemodel.syntax.TransformerSyntax._ + * + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA") + * val imputer = MeanValueImputer(featureInfo) + * val fittedImputer = imputer.fit(data) + * // Note: only the fourth (index 3) column gets imputed as it's the only numerical column with NAs + * fittedImputer.transform(data) * }}} * * @example Impute values for a subset of features. * {{{ - * val featureIndex = FeatureIndex(List("f0", "f1", "f2"), List(NumericalFeature, NumericalFeature, - * NumericalFeature), List(0, 1, 2)) - * val imputerSubsetOfColumns = MeanValueImputer(featureIndex.subset("f0", "f2")) + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.impute.MeanValueImputer + * import io.picnicml.doddlemodel.syntax.TransformerSyntax._ + * + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA") + * val imputerSubset = MeanValueImputer(featureInfo.subset("f3")) + * val fittedImputer = imputerSubset.fit(data) + * fittedImputer.transform(data) * }}} */ def apply(featureIndex: FeatureIndex): MeanValueImputer = MeanValueImputer(none, featureIndex) diff --git a/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala b/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala index 2d912cb2..05a71f8c 100644 --- a/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala +++ b/src/main/scala/io/picnicml/doddlemodel/impute/MostFrequentValueImputer.scala @@ -9,26 +9,39 @@ import io.picnicml.doddlemodel.typeclasses.Transformer case class
MostFrequentValueImputer private (private[impute] val mostFrequent: Option[RealVector], private val featureIndex: FeatureIndex) -/** An immutable simple imputer that replaces all NaN values with most frequent value of a corresponding column. */ + +/** An immutable simple imputer that replaces categorical NaN values with the most frequent value of the corresponding + * column. Numerical values are left untouched. */ object MostFrequentValueImputer { /** Create an imputer based on a feature index. * - * @param featureIndex feature index associated with features, this is needed so that only categorical features + * @param featureIndex feature index associated with features - this is needed so that only categorical features * are transformed by this preprocessor, could be a subset of columns to be transformed * - * @example Impute values for all (numerical) features. + * @example Impute values for all (categorical) features. * {{{ - * val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature, NumericalFeature, - * NumericalFeature)) - * val imputer = MostFrequentValueImputer(featureIndex) + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.impute.MostFrequentValueImputer + * import io.picnicml.doddlemodel.syntax.TransformerSyntax._ + * + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA") + * val imputer = MostFrequentValueImputer(featureInfo) + * val fittedImputer = imputer.fit(data) + * // Note: only the second (index 1) column gets imputed as it's the only categorical column with NAs + * fittedImputer.transform(data) * }}} * * @example Impute values for a subset of features.
* {{{ - * val featureIndex = FeatureIndex(List("f0", "f1", "f2"), List(NumericalFeature, NumericalFeature, - * NumericalFeature), List(0, 1, 2)) - * val imputerSubsetOfColumns = MostFrequentValueImputer(featureIndex.subset("f0", "f2")) + * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset + * import io.picnicml.doddlemodel.impute.MostFrequentValueImputer + * import io.picnicml.doddlemodel.syntax.TransformerSyntax._ + * + * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA") + * val imputerSubset = MostFrequentValueImputer(featureInfo.subset("f1")) + * val fittedImputer = imputerSubset.fit(data) + * fittedImputer.transform(data) * }}} */ def apply(featureIndex: FeatureIndex): MostFrequentValueImputer = From 8406cb933484c08c9be6fa64bb2c984f9663efb1 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 15 Oct 2019 11:33:25 +0200 Subject: [PATCH 13/15] Verify and improve docs for `linear` --- .../doddlemodel/linear/LinearRegression.scala | 14 ++++++++------ .../linear/LogisticRegression.scala | 18 ++++++++++-------- .../doddlemodel/linear/PoissonRegression.scala | 14 ++++++++------ .../doddlemodel/linear/SoftmaxClassifier.scala | 14 ++++++++------ 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala index 34ede7cf..e5c1b92f 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala @@ -13,16 +13,18 @@ object LinearRegression { /** Create a regularized linear regression model. * - * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * @param lambda L2 regularization strength - must be non-negative, 0.0 means no regularization * - * @example Create and fit a regularized linear regression model with lambda = 1.5. 
+ * @example Create and fit a regularized linear regression model with lambda 1.5. * {{{ - * import io.picnicml.doddlemodel.linear.LinearRegression.ev + * import breeze.linalg.{DenseMatrix, DenseVector} + * import io.picnicml.doddlemodel.linear.LinearRegression + * import io.picnicml.doddlemodel.syntax.RegressorSyntax._ * - * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) - * val y: Target = DenseVector(-3.0, 2.0) + * val X = DenseMatrix(List(1.0f, 2.0f), List(3.0f, 4.0f)) + * val y = DenseVector(-3.0f, 2.0f) * val model = LinearRegression(lambda = 1.5f) - * val fittedModel = ev.fit(model, X, y) + * val fittedModel = model.fit(X, y) * }}} */ def apply(lambda: Float = 0.0f): LinearRegression = { diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala index 3a30860b..5d32f17b 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala @@ -15,17 +15,19 @@ object LogisticRegression { /** Create a regularized logistic regression model. * - * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * @param lambda L2 regularization strength - must be non-negative, 0.0 means no regularization * - * @example Create and fit a logistic regression model with lambda = 1.5. - * {{{ - * import io.picnicml.doddlemodel.linear.LogisticRegression.ev + * @example Create and fit a logistic regression model with lambda 1.5. 
+ * {{{ + * import breeze.linalg.{DenseMatrix, DenseVector} + * import io.picnicml.doddlemodel.linear.LogisticRegression + * import io.picnicml.doddlemodel.syntax.ClassifierSyntax._ * - * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) - * val y: Target = DenseVector(0.0, 1.0) + * val X = DenseMatrix(List(1.0f, 2.0f), List(3.0f, 4.0f)) + * val y = DenseVector(0.0f, 1.0f) * val model = LogisticRegression(lambda = 1.5f) - * val fittedModel = ev.fit(model, X, y) - * }}} + * val fittedModel = model.fit(X, y) + * }}} */ def apply(lambda: Float = 0.0f): LogisticRegression = { require(lambda >= 0.0f, "L2 regularization strength must be non-negative") diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala index 9e8b2237..8df43dbc 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala @@ -15,16 +15,18 @@ object PoissonRegression { /** Create a regularized Poisson regression model. * - * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * @param lambda L2 regularization strength - must be non-negative, 0.0 means no regularization * - * @example Create and fit a regularized Poisson regression model with lambda = 1.5. + * @example Create and fit a regularized Poisson regression model with lambda 1.5. 
* {{{ - * import io.picnicml.doddlemodel.linear.PoissonRegression.ev + * import breeze.linalg.{DenseMatrix, DenseVector} + * import io.picnicml.doddlemodel.syntax.RegressorSyntax._ + * import io.picnicml.doddlemodel.linear.PoissonRegression * - * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) - * val y: Target = DenseVector(-3.0, 2.0) + * val X = DenseMatrix(List(1.0f, 2.0f), List(3.0f, 4.0f)) + * val y = DenseVector(-3.0f, 2.0f) * val model = PoissonRegression(lambda = 1.5f) - * val fittedModel = ev.fit(model, X, y) + * val fittedModel = model.fit(X, y) * }}} */ def apply(lambda: Float = 0.0f): PoissonRegression = { diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala index e1e0bd43..5193ae36 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala @@ -16,16 +16,18 @@ object SoftmaxClassifier { /** Create a regularized softmax model. * - * @param lambda L2 regularization strength, must be non-negative, 0.0 means no regularization + * @param lambda L2 regularization strength - must be non-negative, 0.0 means no regularization * - * @example Create and fit a regularized softmax classifier with lambda = 1.5. + * @example Create and fit a regularized softmax classifier with lambda 1.5. 
* {{{ - * import io.picnicml.doddlemodel.linear.SoftmaxClassifier.ev + * import breeze.linalg.{DenseMatrix, DenseVector} + * import io.picnicml.doddlemodel.linear.SoftmaxClassifier + * import io.picnicml.doddlemodel.syntax.ClassifierSyntax._ * - * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0)) - * val y: Target = DenseVector(0.0, 1.0) + * val X = DenseMatrix(List(1.0f, 2.0f), List(3.0f, 4.0f)) + * val y = DenseVector(0.0f, 1.0f) * val model = SoftmaxClassifier(lambda = 1.5f) - * val fittedModel = ev.fit(model, X, y) + * val fittedModel = model.fit(X, y) * }}} */ def apply(lambda: Float = 0.0f): SoftmaxClassifier = { From 9df160c920e097b30348d9d059ab9e2f64ffa11e Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 15 Oct 2019 11:42:44 +0200 Subject: [PATCH 14/15] Consistency fix for linear docs (document object, not case class) --- .../scala/io/picnicml/doddlemodel/linear/LinearRegression.scala | 2 +- .../io/picnicml/doddlemodel/linear/LogisticRegression.scala | 2 +- .../io/picnicml/doddlemodel/linear/PoissonRegression.scala | 2 +- .../io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala index e5c1b92f..5739c4e8 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/LinearRegression.scala @@ -4,11 +4,11 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearRegressor -/** An immutable multiple linear regression model with ridge regularization. */ case class LinearRegression private (lambda: Float, private val w: Option[RealVector]) { private var yPredCache: Target = _ } +/** An immutable multiple linear regression model with ridge regularization. 
*/ object LinearRegression { /** Create a regularized linear regression model. diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala index 5d32f17b..efc3aa24 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/LogisticRegression.scala @@ -6,11 +6,11 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, RealVector, Simplex, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearClassifier -/** An immutable multiple logistic regression model with ridge regularization. */ case class LogisticRegression private (lambda: Float, numClasses: Option[Int], private val w: Option[RealVector]) { private var yPredProbaCache: RealVector = _ } +/** An immutable multiple logistic regression model with ridge regularization. */ object LogisticRegression { /** Create a regularized logistic regression model. diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala b/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala index 8df43dbc..1cf50476 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/PoissonRegression.scala @@ -6,11 +6,11 @@ import cats.syntax.option._ import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearRegressor -/** An immutable multiple Poisson regression model with ridge regularization. */ case class PoissonRegression private (lambda: Float, private val w: Option[RealVector]) { private var yPredMeanCache: Target = _ } +/** An immutable multiple Poisson regression model with ridge regularization. */ object PoissonRegression { /** Create a regularized Poisson regression model. 
diff --git a/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala index 5193ae36..ae0871d4 100644 --- a/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala +++ b/src/main/scala/io/picnicml/doddlemodel/linear/SoftmaxClassifier.scala @@ -7,11 +7,11 @@ import io.picnicml.doddlemodel.data.{Features, RealVector, Simplex, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearClassifier import io.picnicml.doddlemodel.syntax.OptionSyntax._ -/** An immutable multiple multinomial regression model with ridge regularization. */ case class SoftmaxClassifier private (lambda: Float, numClasses: Option[Int], private val w: Option[RealVector]) { private var yPredProbaCache: Simplex = _ } +/** An immutable multiple multinomial regression model with ridge regularization. */ object SoftmaxClassifier { /** Create a regularized softmax model. From ea6454913408ad83ddcabb1eebd433e9eeb9e625 Mon Sep 17 00:00:00 2001 From: Matej Klemen Date: Tue, 15 Oct 2019 13:44:52 +0200 Subject: [PATCH 15/15] Verify and improve docs for `modelselection` --- .../modelselection/CrossValidation.scala | 51 ++++++++++++++----- .../modelselection/GroupKFoldSplitter.scala | 7 ++- .../modelselection/HyperparameterSearch.scala | 23 ++------- .../modelselection/KFoldSplitter.scala | 7 ++- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala index c3798426..5d631f23 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/CrossValidation.scala @@ -15,10 +15,33 @@ class CrossValidation private (val metric: Metric, val dataSplitter: DataSplitte /** Obtain the average score of all folds. 
* - * @param reusable indicates whether to shutdown the thread pool after the cv score is computed - * and by default it is, if the same CrossValidation instance is needed after the first call - * to score(...), bring implicit CrossValReusable(true) to scope and call CrossValidation.shutdownNow() - * after the instance is not needed anymore + * @param reusable indicates whether to shut down the thread pool after the cv score is computed. + * By default it is shut down. + * + * @note If the same `CrossValidation` instance is needed after the first call to `score(...)`, bring an implicit + * `CrossValReusable(true)` to scope and call `CrossValidation.shutdownNow()` after the instance is not + * needed anymore. + * + * @example Reuse a `CrossValidation` instance. + * {{{ + * import breeze.linalg.{DenseMatrix, DenseVector} + * import io.picnicml.doddlemodel.metrics.rmse + * import io.picnicml.doddlemodel.linear.LogisticRegression + * import io.picnicml.doddlemodel.modelselection.{CrossValidation, KFoldSplitter} + * import io.picnicml.doddlemodel.modelselection.CrossValReusable + * + * implicit val cvReusable = CrossValReusable(true) + * val X = DenseMatrix(List(1.0f, 2.0f), List(3.0f, 4.0f), List(5.0f, 6.0f), List(7.0f, 8.0f)) + * val y = DenseVector(0.0f, 1.0f, 0.0f, 1.0f) + * val model = LogisticRegression(1.0f) + * + * val splitter = KFoldSplitter(numFolds = 2) + * val cv = CrossValidation(metric = rmse, dataSplitter = splitter) + * cv.score(model, X, y) + * // would throw a `RejectedExecutionException` if an implicit `CrossValReusable` instance was not defined + * cv.score(model, X, y) + * cv.shutdownNow() + * }}} */ def score[A](model: A, x: Features, y: Target, groups: Option[IntVector] = none) (implicit ev: Predictor[A], @@ -43,7 +66,7 @@ class CrossValidation private (val metric: Metric, val dataSplitter: DataSplitte /** * Shuts down the current thread pool.
Call this if the CrossValidation instance is not needed - * anymore and CrossValReusable(true) is in scope. + * anymore and `CrossValReusable(true)` is in scope. */ def shutdownNow(): Unit = this.ec.shutdownNow() } @@ -58,16 +81,18 @@ object CrossValidation { * @example Perform 2-fold cross validation using logistic regression and evaluate its performance * using root mean squared error. * {{{ - * import io.picnicml.doddlemodel.metrics.rmse - * import io.picnicml.doddlemodel.linear.LogisticRegression + * import breeze.linalg.{DenseMatrix, DenseVector} + * import io.picnicml.doddlemodel.metrics.rmse + * import io.picnicml.doddlemodel.linear.LogisticRegression + * import io.picnicml.doddlemodel.modelselection.{CrossValidation, KFoldSplitter} * - * val X: Features = DenseMatrix(List(1.0, 2.0), List(3.0, 4.0), List(5.0, 6.0), List(7.0, 8.0)) - * val y: Target = DenseVector(0.0, 1.0, 0.0, 1.0) - * val model = LogisticRegression(1.0) + * val X = DenseMatrix(List(1.0f, 2.0f), List(3.0f, 4.0f), List(5.0f, 6.0f), List(7.0f, 8.0f)) + * val y = DenseVector(0.0f, 1.0f, 0.0f, 1.0f) + * val model = LogisticRegression(1.0f) * - * val splitter = KFoldSplitter(numFolds = 2) - * val cv = CrossValidation(metric = rmse, dataSplitter = splitter)) - * cv.score(model, X, y) + * val splitter = KFoldSplitter(numFolds = 2) + * val cv = CrossValidation(metric = rmse, dataSplitter = splitter) + * cv.score(model, X, y) * }}} * * @see [[io.picnicml.doddlemodel.metrics Metrics in doddle-model]] diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala index 0af085fb..427f4451 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/GroupKFoldSplitter.scala @@ -61,8 +61,11 @@ object GroupKFoldSplitter { * @example Split 10 examples, corresponding to data of 3 patients into 3 folds, making 
sure that data of a patient * never appears in both training and test set in the same fold. * {{{ - * val patientFeatures = DenseMatrix.rand(10, 3) - * val isSick = DenseVector(0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0) + * import breeze.linalg.{DenseMatrix, DenseVector, convert} + * import io.picnicml.doddlemodel.modelselection.GroupKFoldSplitter + * + * val patientFeatures = convert(DenseMatrix.rand(10, 3), Float) + * val isSick = DenseVector(0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f) * val idPatient = DenseVector(1, 2, 2, 0, 0, 0, 2, 1, 1, 2) * * val splitter = GroupKFoldSplitter(numFolds = 3) diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala b/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala index 49688bdb..f1cf2e88 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/HyperparameterSearch.scala @@ -7,21 +7,6 @@ import io.picnicml.doddlemodel.typeclasses.Predictor import scala.util.Random - -/** A parallel hyperparameter search using n-fold cross validation. 
- * - * @param numIterations number of predictors for which the cross validation score is calculated - * @param verbose flag that specifies whether validation score of the selected model is - * printed to standard output - * - * Examples: - * val splitter = KFoldSplitter(numFolds = 3) - * val cv: CrossValidation = CrossValidation(metric = accuracy, dataSplitter = splitter) - * val search = HyperparameterSearch(numIterations = 3, crossValidation = cv) - * val bestModel = search.bestOf(x, y) { - * LogisticRegression(lambda = gamma.draw()) - * } - */ class HyperparameterSearch private (val numIterations: Int, val crossVal: CrossValidation, verbose: Boolean) { implicit val cvReusable: CrossValReusable = CrossValReusable(true) @@ -63,16 +48,18 @@ object HyperparameterSearch { * @example Search among 3 different regularization values (0.1, 0.2, 0.5) for logistic regression using * 3-fold cross validation and store the (re-fitted on entire dataset) model that obtains highest accuracy. * {{{ + * import breeze.linalg.{DenseMatrix, DenseVector, convert} * import io.picnicml.doddlemodel.metrics.accuracy * import io.picnicml.doddlemodel.linear.LogisticRegression + * import io.picnicml.doddlemodel.modelselection.{CrossValidation, KFoldSplitter, HyperparameterSearch} * - * val x = DenseMatrix.rand(10, 3) - * val y = DenseVector(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0) + * val x = convert(DenseMatrix.rand(10, 3), Float) + * val y = DenseVector(0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f) * * val splitter = KFoldSplitter(numFolds = 3) * val cv = CrossValidation(metric = accuracy, dataSplitter = splitter) * val search = HyperparameterSearch(numIterations = 3, crossValidation = cv) - * val lambdas = List(0.1, 0.2, 0.5).iterator + * val lambdas = List(0.1f, 0.2f, 0.5f).iterator * * val modelBestParams = search.bestOf(x, y) { * LogisticRegression(lambda = lambdas.next) diff --git a/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala 
b/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala index c2877269..78770317 100644 --- a/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala +++ b/src/main/scala/io/picnicml/doddlemodel/modelselection/KFoldSplitter.scala @@ -60,8 +60,11 @@ object KFoldSplitter { * * @example Split data into 3 folds. * {{{ - * val x = DenseMatrix.rand(7, 2) - * val y = DenseVector(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0) + * import breeze.linalg.{DenseMatrix, DenseVector, convert} + * import io.picnicml.doddlemodel.modelselection.KFoldSplitter + * + * val x = convert(DenseMatrix.rand(7, 2), Float) + * val y = DenseVector(0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f) * * val splitter = KFoldSplitter(numFolds = 3) * splitter.splitData(x, y)