Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,27 @@ import scala.io.{BufferedSource, Source}

object CsvLoader {

/** Loads a csv dataset with 2 header lines (1st line for feature names and 2nd for types). */
/** Loads a csv dataset with 2 header lines (first line for feature names and second for types).
* The file is opened with `Source.fromFile` and handed to the `loadCsvDataset` overload that takes the
* opened source.
* @param datasetFilePath csv file to load
* @param na value to interpret as N/A data in the given dataset, `"NA"` when not specified
* @return whatever the source-based `loadCsvDataset` overload returns for the opened file
*
* @example Reading the iris dataset.
* {{{
* import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
*
* val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
* // separate features from the label
* val (irisFeatures, irisLabels) = (data(::, 0 to 3), data(::, -1))
* }}}
*
* @example Reading a dataset where N/A values are marked with `NA`.
* {{{
* import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
*
* // specify a value to interpret as N/A data
* val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/dummy_csv_reading.csv", "NA")
* }}}
*/
def loadCsvDataset(datasetFilePath: String, na: String = "NA"): FeaturesWithIndex =
// NOTE(review): the source opened here is not closed in this method - confirm that the callee closes it
loadCsvDataset(Source.fromFile(datasetFilePath), na)

Expand Down
62 changes: 60 additions & 2 deletions src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,71 @@ import scala.util.Random

object DatasetUtils {

/** Shuffles rows of the dataset. */
/** Rearranges the rows of the dataset in a random order.
  *
  * A single random permutation of the row indices is drawn and applied to both `x` and `y`, so every
  * feature row stays paired with its label.
  *
  * @param x features to be shuffled
  * @param y labels corresponding to features
  * @param rand random number generator used to draw the permutation; a new `Random` is created when no
  *             implicit instance is available
  * @return the shuffled features and the shuffled labels
  *
  * @example Shuffle a dataset randomly.
  * {{{
  * import scala.util.Random
  * import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
  * import io.picnicml.doddlemodel.data.DatasetUtils.shuffleDataset
  *
  * val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
  * val (dataX, dataY) = (data(::, 0 to 3), data(::, -1))
  *
  * val (shuffledX, shuffledY) = shuffleDataset(dataX, dataY)
  *
  * // seeded shuffle - seed is passed to shuffler implicitly
  * implicit val rand: Random = new Random(42)
  * val (seededX, seededY) = shuffleDataset(dataX, dataY)
  * }}}
  */
def shuffleDataset(x: Features, y: Target)(implicit rand: Random = new Random()): Dataset = {
  // one permutation shared by features and labels keeps the rows aligned
  val permutation = rand.shuffle((0 until y.length).toIndexedSeq)
  val permutedX = x(permutation, ::).toDenseMatrix
  val permutedY = y(permutation).toDenseVector
  (permutedX, permutedY)
}

/** Splits the dataset into two subsets for training and testing. */
/** Splits the dataset into two subsets for training and testing.
* @param x features to be split
* @param y labels corresponding to features
* @param proportionTrain proportion of dataset to be put into training set - between 0.0 and 1.0
*
* @example Split dataset into training and test set.
* {{{
* import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
* import io.picnicml.doddlemodel.data.DatasetUtils.splitDataset
*
* val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
* val (dataX, dataY) = (data(::, 0 to 3), data(::, -1))
*
* // by default, the split is 50%:50%
* val defaultSplit = splitDataset(dataX, dataY)
*
* // put 80% of data into training set and 20% into test set
* val trTeSplit = splitDataset(dataX, dataY, 0.8f)
* val (trainX, trainY, testX, testY) = (trTeSplit.xTr, trTeSplit.yTr, trTeSplit.xTe, trTeSplit.yTe)
* }}}
*
* @example Split dataset into training, validation and test set in the ratio 60%: 10%: 30%. This is done by
* performing two train-test splits in a row. First we split the dataset in ratio (60% + 10%): 30%,
* obtaining combined training and validation set and the test set. Then we split the first part using
* the ratio (60% / 70%): (10% / 70%) to obtain the training and validation set.
* {{{
* import io.picnicml.doddlemodel.data.CsvLoader.loadCsvDataset
* import io.picnicml.doddlemodel.data.DatasetUtils.splitDataset
*
* val (data, featureInfo) = loadCsvDataset("src/main/resources/datasets/iris.csv")
* val (dataX, dataY) = (data(::, 0 to 3), data(::, -1))
*
* val teVsNonTeSplit = splitDataset(dataX, dataY, 0.7f)
* val (trValX, trValY, testX, testY) = (teVsNonTeSplit.xTr, teVsNonTeSplit.yTr,
* teVsNonTeSplit.xTe, teVsNonTeSplit.yTe)
*
* val trValSplit = splitDataset(trValX, trValY, (0.6f / 0.7f))
* val (trainX, trainY, valX, valY) = (trValSplit.xTr, trValSplit.yTr, trValSplit.xTe, trValSplit.yTe)
* }}}
*/
def splitDataset(x: Features, y: Target, proportionTrain: Float = 0.5f): TrainTestSplit = {
val numTrain = numberOfTrainExamplesBasedOnProportion(x.rows, proportionTrain)
val trIndices = 0 until numTrain
Expand Down
85 changes: 85 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/data/Feature.scala
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,71 @@ object Feature {
subset(subsetIndices:_*)
}

/** Builds a feature index that contains only the features with the given names.
  *
  * Every name is resolved to its position in this feature index and the features are selected in the
  * order in which the names are passed.
  * @param names names of the features to be selected
  *
  * @example Create feature index based on features "f1" and "f3" from a constructed feature index.
  * {{{
  * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
  *
  * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
  *                                 NumericalFeature), List(0, 1, 2))
  * val subIndex = featureIndex.subset("f1", "f3")
  * }}}
  */
def subset(names: String*): FeatureIndex = {
  val positionOf = this.names.zipWithIndex.toMap
  val positions = names.map(name => positionOf(name))
  subset(positions:_*)
}

/** Create a feature index with a subset of features, provided by feature indices. Delegates to the
* varargs overload `subset(indices: Int*)`.
* @param indices positions (within this feature index) of the features to be selected
*
* @example Create feature index based on second and third (i.e. indices 1, 2) features from a constructed
* feature index.
* {{{
* import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
*
* val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
* NumericalFeature), List(0, 1, 2))
* val subIndex = featureIndex.subset(1 to 2)
* }}}
*/
def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*)

/** Builds a feature index that contains only the features at the given positions. Varargs alternative
  * that does the same as `FeatureIndex.subset(indices: IndexedSeq[Int])`.
  *
  * The name, type and column index stored at each position are carried over, in the order in which
  * the positions are passed.
  * @param indices positions (within this feature index) of the features to be selected
  *
  * @example Create feature index based on second and third (i.e. indices 1, 2) features from a constructed
  *          feature index.
  * {{{
  * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
  *
  * val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
  *                                 NumericalFeature), List(0, 1, 2))
  * val subIndex = featureIndex.subset(1, 2)
  * }}}
  */
// the DummyImplicit parameter keeps this overload distinct from subset(names: String*) after erasure
def subset(indices: Int*)(implicit di: DummyImplicit): FeatureIndex = {
  val selected = indices.toIndexedSeq
  val keptNames = selected.map(i => this.names(i))
  val keptTypes = selected.map(i => this.types(i))
  val keptColumnIndices = selected.map(i => this.columnIndices(i))
  new FeatureIndex(keptNames, keptTypes, keptColumnIndices)
}

/** Create a feature index by dropping a feature by column index.
* @param index index of column to be dropped
* @example Drop the third (index 2) feature from a feature index.
* {{{
* import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature}
*
* val featureIndex = FeatureIndex(List("f1", "f2", "f3"), List(NumericalFeature, NumericalFeature,
* NumericalFeature), List(0, 1, 2))
* val subIndex = featureIndex.drop(2)
* }}}
*/
def drop(index: Int): FeatureIndex = new FeatureIndex(
this.names.zipWithIndex.flatMap { case (n, i) => if (i != index) n.some else none[String] },
this.types.zipWithIndex.flatMap { case (t, i) => if (i != index) t.some else none[FeatureType] },
Expand All @@ -61,26 +112,60 @@ object Feature {
this.names.zip(this.types).map { case (n, t) => s"$n (${t.headerLineString})" } mkString ", "
}

/** A structure that keeps track of feature metadata (names, types and indices). This is needed
* because some methods are only applicable to a certain type of features, e.g. [0, 1] scaling
* only makes sense for numerical features. */
object FeatureIndex {

  /** Generates the default feature names "f0", "f1", ..., one for each of the `n` features. */
  private def defaultNames(n: Int): List[String] = List.tabulate(n)(i => s"f$i")

  /** Builds a feature index with `n` categorical features that occupy columns `0` to `n - 1`. Names are
    * generated automatically - the `i`th feature is named "f`i`" (using 0-based counting).
    * @param n number of categorical features in feature index
    */
  def categorical(n: Int): FeatureIndex =
    categorical(List.range(0, n))

  /** Builds a feature index with one categorical feature per given column index. Names are generated
    * automatically from the position in the list - the `i`th feature is named "f`i`" (using 0-based
    * counting).
    * @param columnIndices column index for each feature
    */
  def categorical(columnIndices: List[Int]): FeatureIndex =
    apply(defaultNames(columnIndices.length), List.fill(columnIndices.length)(CategoricalFeature), columnIndices)

  /** Builds a feature index with `n` numerical features that occupy columns `0` to `n - 1`. Names are
    * generated automatically - the `i`th feature is named "f`i`" (using 0-based counting).
    * @param n number of numerical features in feature index
    */
  def numerical(n: Int): FeatureIndex =
    numerical(List.range(0, n))

  /** Builds a feature index with one numerical feature per given column index. Names are generated
    * automatically from the position in the list - the `i`th feature is named "f`i`" (using 0-based
    * counting).
    * @param columnIndices column index for each feature
    */
  def numerical(columnIndices: List[Int]): FeatureIndex =
    apply(defaultNames(columnIndices.length), List.fill(columnIndices.length)(NumericalFeature), columnIndices)

  /** Builds a feature index from feature types; the `i`th feature is assigned column index `i`. Names are
    * generated automatically - the `i`th feature is named "f`i`" (using 0-based counting).
    * @param types list of feature types
    *
    * @example Construct a feature index with one numerical and two categorical features.
    * {{{
    * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature, CategoricalFeature}
    * val featureIndex = FeatureIndex(List(CategoricalFeature, NumericalFeature, CategoricalFeature))
    * }}}
    */
  def apply(types: List[FeatureType]): FeatureIndex =
    apply(defaultNames(types.length), types, List.range(0, types.length))

  /** Builds a feature index from feature types and explicit column indices. Names are generated
    * automatically from the position in `types` - the `i`th feature is named "f`i`" (using 0-based
    * counting).
    * @param types list of feature types
    * @param columnIndices column index for each feature
    */
  def apply(types: List[FeatureType], columnIndices: List[Int]): FeatureIndex =
    apply(defaultNames(types.length), types, columnIndices)

  /** Builds a feature index with custom feature names, types and column indices.
    * @param names feature names
    * @param types feature types
    * @param columnIndices column index for each feature
    *
    * @example Construct a feature index with three features, named "age" (numerical), "height" (numerical)
    *          and "group" (categorical).
    * {{{
    * import io.picnicml.doddlemodel.data.Feature.{FeatureIndex, NumericalFeature, CategoricalFeature}
    * val featureIndex = FeatureIndex(List("age", "height", "group"), List(NumericalFeature,
    *                                 NumericalFeature, CategoricalFeature), List(0, 1, 2))
    * }}}
    */
  def apply(names: List[String], types: List[FeatureType], columnIndices: List[Int]): FeatureIndex =
    new FeatureIndex(names.toIndexedSeq, types.toIndexedSeq, columnIndices.toIndexedSeq)
}
Expand Down
16 changes: 16 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/data/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import breeze.linalg.{DenseMatrix, DenseVector, unique}
import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering
import io.picnicml.doddlemodel.data.Feature.FeatureIndex

/** Provides data management utilities and definitions of custom doddle-model data types. */
package object data {

type RealVector = DenseVector[Float]
Expand All @@ -17,9 +18,24 @@ package object data {
type Dataset = (Features, Target)
type DatasetWithIndex = (Features, Target, FeatureIndex)

/** Loads and returns the Boston Housing prices dataset.
*
* @return the features, the target and the feature index of the dataset, as provided by
* `ResourceDatasetLoaders.loadBostonDataset`
*/
def loadBostonDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBostonDataset

/** Loads and returns the Breast cancer Wisconsin (diagnostic) dataset.
*
* @return the features, the target and the feature index of the dataset, as provided by
* `ResourceDatasetLoaders.loadBreastCancerDataset`
* @see <a href="https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29" target="_blank">
* Breast cancer dataset on UCI Machine Learning Repository </a>
*/
def loadBreastCancerDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBreastCancerDataset

/** Loads and returns the Iris dataset.
*
* @return the features, the target and the feature index of the dataset, as provided by
* `ResourceDatasetLoaders.loadIrisDataset`
* @see <a href="https://archive.ics.uci.edu/ml/datasets/iris" target="_blank">
* Iris dataset on UCI Machine Learning Repository </a>
*/
def loadIrisDataset: DatasetWithIndex = ResourceDatasetLoaders.loadIrisDataset

/** Loads and returns an artificial dataset with a Poisson target variable.
*
* @return the features, the target and the feature index of the dataset, as provided by
* `ResourceDatasetLoaders.loadHighSchoolTestDataset`
*/
def loadHighSchoolTestDataset: DatasetWithIndex = ResourceDatasetLoaders.loadHighSchoolTestDataset

def numberOfUniqueGroups(groups: IntVector): Int = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@ import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering
import io.picnicml.doddlemodel.data.{Features, Simplex, Target}
import io.picnicml.doddlemodel.typeclasses.Classifier

/** An immutable dummy classifier that always predicts the most frequent label.
*
* Examples:
* val model = MostFrequentClassifier()
*/
case class MostFrequentClassifier private (numClasses: Option[Int], mostFrequentClass: Option[Float])

/** An immutable dummy classifier that always predicts the most frequent label. */
object MostFrequentClassifier {

/** Create a majority classifier.
*
* @example
* {{{
* import io.picnicml.doddlemodel.dummy.classification.MostFrequentClassifier
* val model = MostFrequentClassifier()
* }}}
*/
def apply(): MostFrequentClassifier = MostFrequentClassifier(none, none)

@SerialVersionUID(0L)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@ import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier.ev
import io.picnicml.doddlemodel.syntax.OptionSyntax._
import io.picnicml.doddlemodel.typeclasses.Classifier

/** An immutable dummy classifier that samples predictions from a stratified categorical distribution.
*
* Examples:
* val model = StratifiedClassifier()
*/
case class StratifiedClassifier private (numClasses: Option[Int],
targetDistr: Option[Multinomial[DenseVector[Double], Int]]) {

Expand All @@ -23,8 +18,17 @@ case class StratifiedClassifier private (numClasses: Option[Int],
}
}

/** An immutable dummy classifier that samples predictions from a stratified categorical distribution. */
object StratifiedClassifier {

/** Create a stratified classifier.
*
* @example
* {{{
* import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier
* val model = StratifiedClassifier()
* }}}
*/
def apply(): StratifiedClassifier = StratifiedClassifier(none, none)

@SerialVersionUID(0L)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@ import cats.syntax.option._
import io.picnicml.doddlemodel.data.{Features, Simplex, Target}
import io.picnicml.doddlemodel.typeclasses.Classifier

/** An immutable dummy classifier that samples predictions from a uniform categorical distribution.
*
* Examples:
* val model = UniformClassifier()
*/
case class UniformClassifier private (numClasses: Option[Int])

/** An immutable dummy classifier that samples predictions from a uniform categorical distribution. */
object UniformClassifier {

/** Create a uniform classifier.
*
* @example
* {{{
* import io.picnicml.doddlemodel.dummy.classification.UniformClassifier
* val model = UniformClassifier()
* }}}
*/
def apply(): UniformClassifier = UniformClassifier(none)

@SerialVersionUID(0L)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,20 @@ import cats.syntax.option._
import io.picnicml.doddlemodel.data.{Features, Target}
import io.picnicml.doddlemodel.typeclasses.Regressor

/** An immutable dummy regressor that always predicts the sample mean.
*
* Examples:
* val model = MeanRegressor()
*/

case class MeanRegressor private (mean: Option[Float])

/** An immutable dummy regressor that always predicts the sample mean. */
object MeanRegressor {

/** Create a mean regressor.
*
* @example
* {{{
* import io.picnicml.doddlemodel.dummy.regression.MeanRegressor
* val model = MeanRegressor()
* }}}
*/
def apply(): MeanRegressor = MeanRegressor(none)

@SerialVersionUID(0L)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@ import cats.syntax.option._
import io.picnicml.doddlemodel.data.{Features, Target}
import io.picnicml.doddlemodel.typeclasses.Regressor

/** An immutable dummy regressor that always predicts the sample median.
*
* Examples:
* val model = MedianRegressor()
*/
case class MedianRegressor private (median: Option[Float])

/** An immutable dummy regressor that always predicts the sample median. */
object MedianRegressor {

/** Create a median regressor.
*
* @example
* {{{
* import io.picnicml.doddlemodel.dummy.regression.MedianRegressor
* val model = MedianRegressor()
* }}}
*/
def apply(): MedianRegressor = MedianRegressor(none)

@SerialVersionUID(0L)
Expand Down
Loading