picnicml · matejklemen · Jul 13, 2019 · Jul 13, 2019 · Jul 13, 2019 · Jul 13, 2019
diff --git a/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala b/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala
@@ -9,7 +9,27 @@ import scala.io.{BufferedSource, Source}
 
 object CsvLoader {
 
-  /** Loads a csv dataset with 2 header lines (1st line for feature names and 2nd for types). */
+  /** Loads a csv dataset with 2 header lines (first line for feature names and second for types).
+    * @param datasetFilePath csv file to load
+    * @param na string that marks missing (N/A) values in the given dataset; defaults to `"NA"`
+    *
+    * @example Reading the iris dataset.
+    *   {{{
+    *     import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
+    *
+    *     val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
+    *     // separate features from the label
+    *     val (irisFeatures, irisLabels) = (data(::, 0 to 3), data(::, -1))
+    *   }}}
+    *
+    * @example Reading a dataset where N/A values are marked with `NA`.
+    *   {{{
+    *     import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
+    *
+    *     // specify a value to interpret as N/A data
+    *     val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA")
+    *   }}}
+    */
   def loadCsvDataset(datasetFilePath: String, na: String = "NA"): FeaturesWithIndex =
     loadCsvDataset(Source.fromFile(datasetFilePath), na)
 

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala b/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala
@@ -6,13 +6,71 @@ import scala.util.Random
 
 object DatasetUtils {
 
-  /** Shuffles rows of the dataset. */
+  /** Shuffles rows of the dataset.
+    * @param x features to be shuffled
+    * @param y labels corresponding to features
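+    * @param rand generator used to draw the permutation; a new `Random()` is used if none is implicit in scope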
+    * @example Shuffle a dataset randomly.
+    *   {{{
+    *     import scala.util.Random
+    *     import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
+    *     import io.picnicml.doddlemodel.data.DatasetUtils.shuffleDataset
+    *
+    *     val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
+    *     val (dataX, dataY) = (data(::, 0 to 3), data(::, -1))
+    *
+    *     val (shuffledX, shuffledY) = shuffleDataset(dataX, dataY)
+    *
+    *     // seeded shuffle - the seeded generator is picked up implicitly
+    *     implicit val rand: Random = new Random(42)
+    *     val (seededX, seededY) = shuffleDataset(dataX, dataY)
+    *   }}}
+    */
   def shuffleDataset(x: Features, y: Target)(implicit rand: Random = new Random()): Dataset = {
     val shuffleIndices = rand.shuffle((0 until y.length).toIndexedSeq)
     (x(shuffleIndices, ::).toDenseMatrix, y(shuffleIndices).toDenseVector)
   }
 
-  /** Splits the dataset into two subsets for training and testing. */
+  /** Splits the dataset into two subsets for training and testing.
+    * @param x features to be split
+    * @param y labels corresponding to features
+    * @param proportionTrain proportion of dataset to be put into training set - between 0.0 and 1.0
+    *
+    * @example Split dataset into training and test set.
+    *   {{{
+    *     import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
+    *     import io.picnicml.doddlemodel.data.DatasetUtils.splitDataset
+    *
+    *     val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
+    *     val (dataX, dataY) = (data(::, 0 to 3), data(::, -1))
+    *
+    *     // by default, the split is 50%:50%
+    *     val halfSplit = splitDataset(dataX, dataY)
+    *
+    *     // put 80% of data into training set and 20% into test set
+    *     val trTeSplit = splitDataset(dataX, dataY, 0.8f)
+    *     val (trainX, trainY, testX, testY) = (trTeSplit.xTr, trTeSplit.yTr, trTeSplit.xTe, trTeSplit.yTe)
+    *   }}}
+    *
+    * @example Split dataset into training, validation and test set in the ratio 60% : 10% : 30%. This is done by
+    *          performing two train-test splits in a row. First we split the dataset in the ratio (60% + 10%) : 30%,
+    *          obtaining the combined training and validation set and the test set. Then we split the first part
+    *          using the ratio (60% / 70%) : (10% / 70%) to obtain the training and the validation set.
+    *   {{{
+    *     import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
+    *     import io.picnicml.doddlemodel.data.DatasetUtils.splitDataset
+    *
+    *     val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
+    *     val (dataX, dataY) = (data(::, 0 to 3), data(::, -1))
+    *
+    *     val teVsNonTeSplit = splitDataset(dataX, dataY, 0.7f)
+    *     val (trValX, trValY, testX, testY) = (teVsNonTeSplit.xTr, teVsNonTeSplit.yTr,
+    *                                           teVsNonTeSplit.xTe, teVsNonTeSplit.yTe)
+    *
+    *     val trValSplit = splitDataset(trValX, trValY, (0.6f / 0.7f))
+    *     val (trainX, trainY, valX, valY) = (trValSplit.xTr, trValSplit.yTr, trValSplit.xTe, trValSplit.yTe)
+    *   }}}
+    */
   def splitDataset(x: Features, y: Target, proportionTrain: Float = 0.5f): TrainTestSplit = {
     val numTrain = numberOfTrainExamplesBasedOnProportion(x.rows, proportionTrain)
     val trIndices = 0 until numTrain

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala b/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala
@@ -35,20 +35,71 @@ object Feature {
       subset(subsetIndices:_*)
     }
 
+    /** Creates a feature index containing only the features with the given names, in the given order.
+      * @param names names of the features to be selected
+      *
+      * @example Create feature index based on features "f1" and "f3" from a constructed feature index.
+      *   {{{
+      *     import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
+      *
+      *     val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
+      *       NumericalFeature), List(0, 1, 2))
+      *     val subIndex = featureIndex.subset("f1", "f3")
+      *   }}}
+      * @throws NoSuchElementException if a name is not present in this feature index
+      */
     def subset(names: String*): FeatureIndex = {
       val nameToIndex = this.names.zipWithIndex.toMap
       subset(names.map(n => nameToIndex(n)):_*)
     }
 
+    /** Creates a feature index containing only the features at the given positions.
+      * @param indices 0-based positions (within this feature index) of the features to be selected
+      *
+      * @example Create feature index based on second and third (i.e. indices 1, 2) features from a constructed
+      *          feature index.
+      *   {{{
+      *     import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
+      *
+      *     val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
+      *       NumericalFeature), List(0, 1, 2))
+      *     val subIndex = featureIndex.subset(1 to 2)
+      *   }}}
+      */
     def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*)
 
+    /** Creates a feature index containing only the features at the given positions. Varargs alternative to
+      * `FeatureIndex.subset(indices: IndexedSeq[Int])`.
+      * @param indices 0-based positions (within this feature index) of the features to be selected
+      *
+      * @example Create feature index based on second and third (i.e. indices 1, 2) features from a constructed
+      *          feature index.
+      * {{{
+      *   import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
+      *
+      *   val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
+      *     NumericalFeature), List(0, 1, 2))
+      *   val subIndex = featureIndex.subset(1, 2)
+      * }}}
+      */
     // DummyImplicit is needed to avoid the same type as String* after erasure
     def subset(indices: Int*)(implicit di: DummyImplicit): FeatureIndex = new FeatureIndex(
       indices.toIndexedSeq.map(i => this.names(i)),
       indices.toIndexedSeq.map(i => this.types(i)),
       indices.toIndexedSeq.map(i => this.columnIndices(i))
     )
 
+    /** Creates a feature index without the feature at the given position.
+      * @param index 0-based position (within this feature index) of the feature to be dropped
+      * @example Drop the third (index 2) feature from a feature index.
+      *   {{{
+      *     import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
+      *
+      *     val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
+      *       NumericalFeature), List(0, 1, 2))
+      *     val subIndex = featureIndex.drop(2)
+      *   }}}
+      */
     def drop(index: Int): FeatureIndex = new FeatureIndex(
       this.names.zipWithIndex.flatMap { case (n, i) => if (i != index) n.some else none[String] },
       this.types.zipWithIndex.flatMap { case (t, i) => if (i != index) t.some else none[FeatureType] },
@@ -61,26 +112,60 @@ object Feature {
       this.names.zip(this.types).map { case (n, t) => s"$n (${t.headerLineString})" } mkString ", "
   }
 
+  /** A structure that keeps track of feature metadata (names, types and indices). This is needed
+    * because some methods are only applicable to a certain type of features, e.g. [0, 1] scaling
+    * only makes sense for numerical features. */
   object FeatureIndex {
 
+    /** Construct feature index with `n` categorical features. Feature names are generated automatically - `i`th
+      * feature gets assigned the name "f`i`" (using 0-based counting).
+      * @param n number of categorical features in feature index
+      */
     def categorical(n: Int): FeatureIndex =
       categorical((0 until n).toList)
 
     def categorical(columnIndices: List[Int]): FeatureIndex =
       apply(columnIndices.indices.map(i => s"f$i").toList, columnIndices.map(_ => CategoricalFeature), columnIndices)
 
+    /** Construct feature index with `n` numerical features. Feature names are generated automatically - `i`th
+      * feature gets assigned the name "f`i`" (using 0-based counting).
+      * @param n number of numerical features in feature index
+      */
     def numerical(n: Int): FeatureIndex =
       numerical((0 until n).toList)
 
     def numerical(columnIndices: List[Int]): FeatureIndex =
       apply(columnIndices.indices.map(i => s"f$i").toList, columnIndices.map(_ => NumericalFeature), columnIndices)
 
+    /** Construct feature index from feature types. Feature names are generated automatically - `i`th
+      * feature gets assigned the name "f`i`" (using 0-based counting).
+      * @param types list of feature types
+      *
+      * @example Construct a feature index with one numerical and two categorical features.
+      *   {{{
+      *     import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature, CategoricalFeature}
+      *     val featureIndex = FeatureIndex(List(CategoricalFeature, NumericalFeature, CategoricalFeature))
+      *   }}}
+      */
     def apply(types: List[FeatureType]): FeatureIndex =
       apply(types.indices.map(i => s"f$i").toList, types, types.indices.toList)
 
     def apply(types: List[FeatureType], columnIndices: List[Int]): FeatureIndex =
       apply(types.indices.map(i => s"f$i").toList, types, columnIndices)
 
+    /** Construct a feature index with custom feature names, types and column indices.
+      * @param names feature names
+      * @param types feature types
+      * @param columnIndices column index for each feature
+      *
+      * @example Construct a feature index with three features, named "age" (numerical), "height" (numerical)
+      *          and "group" (categorical).
+      *   {{{
+      *     import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature, CategoricalFeature}
+      *     val featureIndex = FeatureIndex(List("age", "height", "group"), List(NumericalFeature,
+      *       NumericalFeature, CategoricalFeature), List(0, 1, 2))
+      *   }}}
+      */
     def apply(names: List[String], types: List[FeatureType], columnIndices: List[Int]): FeatureIndex =
       new FeatureIndex(names.toIndexedSeq, types.toIndexedSeq, columnIndices.toIndexedSeq)
   }

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/package.scala b/src/main/scala/io/picnicml/doddlemodel/data/package.scala
@@ -4,6 +4,7 @@ import breeze.linalg.{DenseMatrix, DenseVector, unique}
 import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering
 import io.picnicml.doddlemodel.data.Feature.FeatureIndex
 
+/** Provides data management utilities and definitions of custom doddle-model data types. */
 package object data {
 
   type RealVector = DenseVector[Float]
@@ -17,9 +18,24 @@ package object data {
   type Dataset = (Features, Target)
   type DatasetWithIndex = (Features, Target, FeatureIndex)
 
+  /** Loads and returns the Boston Housing prices dataset. */
   def loadBostonDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBostonDataset
+
+  /** Loads and returns the Breast cancer Wisconsin (diagnostic) dataset.
+    *
+    * @see <a href="https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29" target="_blank">
+    *        Breast cancer dataset on UCI Machine Learning Repository </a>
+    */
   def loadBreastCancerDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBreastCancerDataset
+
+  /** Loads and returns the Iris dataset.
+    *
+    * @see <a href="https://archive.ics.uci.edu/ml/datasets/iris" target="_blank">
+    *        Iris dataset on UCI Machine Learning Repository </a>
+    */
   def loadIrisDataset: DatasetWithIndex = ResourceDatasetLoaders.loadIrisDataset
+
+  /** Loads and returns an artificial dataset with a Poisson target variable. */
   def loadHighSchoolTestDataset: DatasetWithIndex = ResourceDatasetLoaders.loadHighSchoolTestDataset
 
   def numberOfUniqueGroups(groups: IntVector): Int = {

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala
@@ -6,15 +6,19 @@ import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering
 import io.picnicml.doddlemodel.data.{Features, Simplex, Target}
 import io.picnicml.doddlemodel.typeclasses.Classifier
 
-/** An immutable dummy classifier that always predicts the most frequent label.
-  *
-  * Examples:
-  * val model = MostFrequentClassifier()
-  */
 case class MostFrequentClassifier private (numClasses: Option[Int], mostFrequentClass: Option[Float])
 
+/** An immutable dummy classifier that always predicts the most frequent label. */
 object MostFrequentClassifier {
 
+  /** Create a majority classifier.
+    *
+    * @example
+    *   {{{
+    *     import io.picnicml.doddlemodel.dummy.classification.MostFrequentClassifier
+    *     val model = MostFrequentClassifier()
+    *   }}}
+    */
   def apply(): MostFrequentClassifier = MostFrequentClassifier(none, none)
 
   @SerialVersionUID(0L)

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala
@@ -9,11 +9,6 @@ import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier.ev
 import io.picnicml.doddlemodel.syntax.OptionSyntax._
 import io.picnicml.doddlemodel.typeclasses.Classifier
 
-/** An immutable dummy classifier that samples predictions from a stratified categorical distribution.
-  *
-  * Examples:
-  * val model = StratifiedClassifier()
-  */
 case class StratifiedClassifier private (numClasses: Option[Int],
                                          targetDistr: Option[Multinomial[DenseVector[Double], Int]]) {
 
@@ -23,8 +18,17 @@ case class StratifiedClassifier private (numClasses: Option[Int],
   }
 }
 
+/** An immutable dummy classifier that samples predictions from a stratified categorical distribution. */
 object StratifiedClassifier {
 
+  /** Create a stratified classifier.
+    *
+    * @example
+    *   {{{
+    *     import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier
+    *     val model = StratifiedClassifier()
+    *   }}}
+    */
   def apply(): StratifiedClassifier = StratifiedClassifier(none, none)
 
   @SerialVersionUID(0L)

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala
@@ -6,15 +6,19 @@ import cats.syntax.option._
 import io.picnicml.doddlemodel.data.{Features, Simplex, Target}
 import io.picnicml.doddlemodel.typeclasses.Classifier
 
-/** An immutable dummy classifier that samples predictions from a uniform categorical distribution.
-  *
-  * Examples:
-  * val model = UniformClassifier()
-  */
 case class UniformClassifier private (numClasses: Option[Int])
 
+/** An immutable dummy classifier that samples predictions from a uniform categorical distribution. */
 object UniformClassifier {
 
+  /** Create a uniform classifier.
+    *
+    * @example
+    *   {{{
+    *     import io.picnicml.doddlemodel.dummy.classification.UniformClassifier
+    *     val model = UniformClassifier()
+    *   }}}
+    */
   def apply(): UniformClassifier = UniformClassifier(none)
 
   @SerialVersionUID(0L)

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MeanRegressor.scala
@@ -6,15 +6,20 @@ import cats.syntax.option._
 import io.picnicml.doddlemodel.data.{Features, Target}
 import io.picnicml.doddlemodel.typeclasses.Regressor
 
-/** An immutable dummy regressor that always predicts the sample mean.
-  *
-  * Examples:
-  * val model = MeanRegressor()
-  */
+
 case class MeanRegressor private (mean: Option[Float])
 
+/** An immutable dummy regressor that always predicts the sample mean. */
 object MeanRegressor {
 
+  /** Create a mean regressor.
+    *
+    * @example
+    * {{{
+    *   import io.picnicml.doddlemodel.dummy.regression.MeanRegressor
+    *   val model = MeanRegressor()
+    * }}}
+    */
   def apply(): MeanRegressor = MeanRegressor(none)
 
   @SerialVersionUID(0L)

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/regression/MedianRegressor.scala
@@ -6,15 +6,19 @@ import cats.syntax.option._
 import io.picnicml.doddlemodel.data.{Features, Target}
 import io.picnicml.doddlemodel.typeclasses.Regressor
 
-/** An immutable dummy regressor that always predicts the sample median.
-  *
-  * Examples:
-  * val model = MedianRegressor()
-  */
 case class MedianRegressor private (median: Option[Float])
 
+/** An immutable dummy regressor that always predicts the sample median. */
 object MedianRegressor {
 
+  /** Create a median regressor.
+    *
+    * @example
+    * {{{
+    *   import io.picnicml.doddlemodel.dummy.regression.MedianRegressor
+    *   val model = MedianRegressor()
+    * }}}
+    */
   def apply(): MedianRegressor = MedianRegressor(none)
 
   @SerialVersionUID(0L)