Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseVector
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Settings of a preprocessor that thresholds numerical columns.
 *
 * Construction fails with an `IllegalArgumentException` when the feature index contains numerical
 * columns and their number differs from the number of thresholds.
 *
 * @param thresholds threshold values, one per numerical column
 * @param featureIndex feature index used to locate the numerical columns
 */
case class Binarizer(private val thresholds: RealVector, private val featureIndex: FeatureIndex) {
  require(
    {
      val numericalColumnCount = featureIndex.numerical.columnIndices.length
      // an index without numerical columns is accepted regardless of the thresholds given
      numericalColumnCount == 0 || numericalColumnCount == thresholds.length
    },
    "A threshold should be given for every numerical column"
  )
}

/** An immutable preprocessor that binarizes numerical features according to a threshold.
* Numerical feature values that are greater than the threshold are set to `1.0`, while those that are lower or equal
* are set to `0.0`. Non-numerical features are left untouched.
* */
object Binarizer {

  /** Create a binarizer where a single threshold is applied to all numerical columns.
   *
   * @param threshold threshold to be applied
   * @param featureIndex feature index associated with features - this is needed so that only numerical features are
   *                     transformed by this preprocessor; could be a subset of columns to be transformed
   * @return a binarizer holding one copy of `threshold` per numerical column
   *
   * @example Binarize a matrix with two features: one numerical and one categorical.
   *   {{{
   *   import io.picnicml.doddlemodel.preprocessing.Binarizer.ev
   *
   *   val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature))
   *   val x = DenseMatrix(
   *     List(1.0, 0.0),
   *     List(-1.0, 1.0),
   *     List(2.0, 0.0)
   *   )
   *   // equivalently, DenseVector(0.0) could be used
   *   val threshold = 0.0
   *   val binarizer = Binarizer(threshold, featureIndex)
   *   // Note: no fitting required
   *   val xTransformed = ev.transform(binarizer, x)
   *   }}}
   */
  def apply(threshold: Double, featureIndex: FeatureIndex): Binarizer = {
    val numNumeric: Int = featureIndex.numerical.columnIndices.length
    val thresholdsExtended = DenseVector.fill(numNumeric) { threshold }
    Binarizer(thresholdsExtended, featureIndex)
  }

  /** Transformer instance for [[Binarizer]]; the binarizer carries no learned state. */
  implicit lazy val ev: Transformer[Binarizer] = new Transformer[Binarizer] {

    /** Always `true`: there is nothing to learn from data. */
    override def isFitted(model: Binarizer): Boolean = true

    /** Returns `model` unchanged; `x` is not inspected. */
    override def fit(model: Binarizer, x: Features): Binarizer = model

    /** Binarizes the numerical columns of a copy of `x`; `x` itself is not modified.
     *
     * Values strictly greater than the column's threshold become `1.0`, all others `0.0`.
     * Columns that are not listed as numerical in the feature index are copied as they are.
     */
    override protected def transformSafe(model: Binarizer, x: Features): Features = {
      val xCopy = x.copy
      model.featureIndex.numerical.columnIndices.zipWithIndex.foreach {
        case (colIndex, thresholdIndex) =>
          // the i-th numerical column is paired with the i-th threshold; look it up once per column
          val threshold = model.thresholds(thresholdIndex)
          (0 until xCopy.rows).foreach { rowIndex =>
            xCopy(rowIndex, colIndex) = if (xCopy(rowIndex, colIndex) > threshold) 1.0 else 0.0
          }
      }

      xCopy
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.*
import io.picnicml.doddlemodel.data.Features
import io.picnicml.doddlemodel.preprocessing.Norms.{L2Norm, Norm}
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Settings of a row normalizer.
 *
 * @param normFunction norm used to compute the divisor of every row; `L2Norm` when omitted
 */
case class Normalizer(
  normFunction: Norm = L2Norm
)

/** An immutable preprocessor that normalizes rows to unit norm according to specified norm function.
* See [[io.picnicml.doddlemodel.preprocessing.Norms]] for supported norm functions.
*
* @example Scale rows to unit norm according to L2 norm.
* {{{
* import io.picnicml.doddlemodel.preprocessing.Normalizer.ev
* import io.picnicml.doddlemodel.preprocessing.Norms.L2Norm
*
* val x = DenseMatrix(
* List(1.0, 2.0, 2.0),
* List(-2.0, 0.0, 0.0)
* )
* val l2Normalizer = Normalizer(L2Norm)
* // Note: no fitting required
* val xNormalized = ev.transform(l2Normalizer, x)
* }}}
* */
object Normalizer {

  /** Transformer instance for [[Normalizer]]; the normalizer carries no learned state. */
  implicit lazy val ev: Transformer[Normalizer] = new Transformer[Normalizer] {

    /** Always `true`: there is nothing to learn from data. */
    override def isFitted(model: Normalizer): Boolean = true

    /** Returns `model` unchanged; `x` is not inspected. */
    override def fit(model: Normalizer, x: Features): Normalizer = model

    /** Divides every row of `x` by the norm computed for that row by `model.normFunction`.
     *
     * Rows whose norm is `0.0` are divided by `1.0` instead, so they come out unchanged.
     */
    override protected def transformSafe(model: Normalizer, x: Features): Features = {
      val rowNorms = model.normFunction(x)
      // no-op for zero vector: replace zero norms in place so the division below is by 1.0
      rowNorms(rowNorms :== 0.0) := 1.0
      // broadcast over columns: each column is divided element-wise by the vector of row norms
      x(::, *) /:/ rowNorms
    }
  }
}
24 changes: 24 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/preprocessing/Norms.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{Axis, max, sum}
import breeze.numerics.{abs, pow, sqrt}
import io.picnicml.doddlemodel.data.{Features, RealVector}

/** Norm functions that reduce a feature matrix along `Axis._1`, yielding one value per row. */
object Norms {

  /** A norm computed for every row of a feature matrix. */
  sealed trait Norm {

    /** Computes the norm of each row of `x`.
     *
     * @param x feature matrix
     * @return vector of norms, one entry per row of `x`
     */
    def apply(x: Features): RealVector
  }

  /** Sum of the absolute values of a row. */
  case object L1Norm extends Norm {
    override def apply(x: Features): RealVector = sum(abs(x), Axis._1)
  }

  /** Square root of the sum of the squared values of a row. */
  case object L2Norm extends Norm {
    override def apply(x: Features): RealVector = sqrt(sum(pow(x, 2), Axis._1))
  }

  /** Largest absolute value of a row. */
  case object MaxNorm extends Norm {
    override def apply(x: Features): RealVector = max(abs(x), Axis._1)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{Axis, max, min}
import cats.syntax.option._
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.syntax.OptionSyntax._
import io.picnicml.doddlemodel.typeclasses.Transformer

/** State of a preprocessor that rescales numerical columns to a target range.
 *
 * The primary constructor is private.
 *
 * @param scale per-column multiplier, when present
 * @param minAdjustment per-column offset added after multiplying, when present
 * @param range lower and upper bound of the target range
 * @param featureIndex feature index used to locate the numerical columns
 */
case class RangeScaler private (
  private val scale: Option[RealVector],
  private val minAdjustment: Option[RealVector],
  private val range: (Double, Double),
  private val featureIndex: FeatureIndex
)

/** An immutable preprocessor that scales numerical features to a specified range.
* Non-numerical features are left untouched.
* */
object RangeScaler {

  /** Create an unfitted RangeScaler that scales numerical features to the given range
   * (i.e. the column minimum and maximum seen during fitting map to the lower and upper bound).
   *
   * @param range lower and upper bound of range
   * @param featureIndex feature index associated with features - this is needed so that only numerical features are
   *                     transformed by this preprocessor; could be a subset of columns to be transformed
   * @throws IllegalArgumentException if the upper bound is not greater than the lower bound
   *
   * @example Scale a matrix with two features (one numerical and one categorical) to range [0.0, 1.0].
   *   {{{
   *   import io.picnicml.doddlemodel.preprocessing.RangeScaler.ev
   *
   *   val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature))
   *   val x = DenseMatrix(
   *     List(2.0, 1.0),
   *     List(3.0, 0.0),
   *     List(0.0, 0.0),
   *     List(5.0, 1.0)
   *   )
   *   val rangeScaler = RangeScaler((0.0, 1.0), featureIndex)
   *   val trainedRangeScaler = ev.fit(rangeScaler, x)
   *   ev.transform(trainedRangeScaler, x)
   *   }}}
   */
  def apply(range: (Double, Double), featureIndex: FeatureIndex): RangeScaler = {
    val (lowerBound, upperBound) = range
    require(upperBound > lowerBound, "Upper bound of range must be greater than lower bound")
    RangeScaler(none, none, range, featureIndex)
  }

  /** Transformer instance for [[RangeScaler]]. */
  implicit lazy val ev: Transformer[RangeScaler] = new Transformer[RangeScaler] {

    /** A scaler counts as fitted once both the scale and the offset are present. */
    override def isFitted(model: RangeScaler): Boolean =
      model.scale.isDefined && model.minAdjustment.isDefined

    /** Derives the per-column scale and offset from the numerical columns of `x`.
     *
     * @return a copy of `model` with scale and offset filled in; range and feature index are kept
     */
    override def fit(model: RangeScaler, x: Features): RangeScaler = {
      val (lowerBound, upperBound) = model.range
      val numericColIndices = model.featureIndex.numerical.columnIndices
      val colMax = max(x(::, numericColIndices), Axis._0).t.toDenseVector
      val colMin = min(x(::, numericColIndices), Axis._0).t.toDenseVector
      val dataRange = colMax - colMin
      // avoid division by zero for constant features (max == min)
      dataRange(dataRange :== 0.0) := 1.0

      val scale = (upperBound - lowerBound) / dataRange
      // chosen so that colMin * scale + minAdjustment == lowerBound
      val minAdjustment = lowerBound - (colMin *:* scale)

      model.copy(scale = scale.some, minAdjustment = minAdjustment.some)
    }

    /** Applies `value * scale + minAdjustment` to the numerical columns of a copy of `x`.
     *
     * `x` itself is not modified, and columns not listed as numerical are copied as they are.
     */
    override protected def transformSafe(model: RangeScaler, x: Features): Features = {
      val xCopy = x.copy
      // NOTE(review): getOrBreak presumably fails when the value is absent (unfitted model) - confirm in OptionSyntax
      val scale = model.scale.getOrBreak
      val minAdjustment = model.minAdjustment.getOrBreak
      model.featureIndex.numerical.columnIndices.zipWithIndex.foreach {
        case (colIndex, idx) =>
          // the i-th numerical column is paired with the i-th scale and offset entry
          xCopy(::, colIndex) := (xCopy(::, colIndex) *:* scale(idx)) +:+ minAdjustment(idx)
      }

      xCopy
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{DenseMatrix, DenseVector}
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature}
import io.picnicml.doddlemodel.preprocessing.Binarizer.ev
import org.scalatest.{FlatSpec, Matchers}

/** Behaviour checks for [[Binarizer]]: per-column thresholds, a shared threshold, data without
 * numerical columns and a threshold count that does not match the number of numerical columns.
 */
class BinarizerTest extends FlatSpec with Matchers with TestingUtils {

  // 3x3 fixture shared by all cases
  private val features = DenseMatrix(
    List(0.0, 1.0, 0.0),
    List(0.3, -1.0, 1.0),
    List(-0.3, 2.0, 0.0)
  )

  "Binarizer" should "process the numerical columns by corresponding thresholds" in {
    val perColumnThresholds: DenseVector[Double] = DenseVector(0.0, -1.5)
    val twoNumericalOneCategorical =
      FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val model = Binarizer(perColumnThresholds, twoNumericalOneCategorical)

    // the third (categorical) column must come out unchanged
    val expected = DenseMatrix(
      List(0.0, 1.0, 0.0),
      List(1.0, 1.0, 1.0),
      List(0.0, 1.0, 0.0)
    )
    val actual = ev.transform(model, features)

    breezeEqual(actual, expected) shouldBe true
  }

  it should "process all the numerical columns by a single threshold" in {
    val sharedThreshold: Double = 0.5
    val allNumerical = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    val model = Binarizer(sharedThreshold, allNumerical)

    val expected = DenseMatrix(
      List(0.0, 1.0, 0.0),
      List(0.0, 0.0, 1.0),
      List(0.0, 1.0, 0.0)
    )
    val actual = ev.transform(model, features)

    breezeEqual(actual, expected) shouldBe true
  }

  it should "amount to no-op if there are no numerical features in data" in {
    val allCategorical =
      FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature))
    val vectorThresholds: DenseVector[Double] = DenseVector(0.0, -1.5)
    val scalarThreshold: Double = 0.5

    val fromVector = Binarizer(vectorThresholds, allCategorical)
    val fromScalar = Binarizer(scalarThreshold, allCategorical)

    breezeEqual(ev.transform(fromVector, features), features) shouldBe true
    breezeEqual(ev.transform(fromScalar, features), features) shouldBe true
  }

  it should "fail when the amount of passed thresholds is different to number of numerical features in data" in {
    val allNumerical = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    // only two thresholds for three numerical columns
    val tooFewThresholds: DenseVector[Double] = DenseVector(0.0, -1.5)

    an [IllegalArgumentException] should be thrownBy Binarizer(tooFewThresholds, allNumerical)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.preprocessing.Normalizer.ev
import io.picnicml.doddlemodel.preprocessing.Norms.{L1Norm, MaxNorm}
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Behaviour checks for [[Normalizer]] with the L2, L1 and max norms, including zero-norm rows. */
class NormalizerTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  "Normalizer" should "scale rows to unit norm using various norms" in {
    val features = DenseMatrix(
      List(1.0, 2.0, 2.0),
      List(-1.0, 1.0, 0.5),
      List(-2.0, 0.0, 0.0)
    )

    // default norm function is L2
    val expectedL2 = DenseMatrix(
      List(0.3333, 0.6666, 0.6666),
      List(-0.6666, 0.6666, 0.3333),
      List(-1.0, 0.0, 0.0)
    )
    breezeEqual(ev.transform(Normalizer(), features), expectedL2) shouldBe true

    val expectedL1 = DenseMatrix(
      List(0.2, 0.4, 0.4),
      List(-0.4, 0.4, 0.2),
      List(-1.0, 0.0, 0.0)
    )
    breezeEqual(ev.transform(Normalizer(L1Norm), features), expectedL1) shouldBe true

    val expectedMax = DenseMatrix(
      List(0.5, 1.0, 1.0),
      List(-1.0, 1.0, 0.5),
      List(-1.0, 0.0, 0.0)
    )
    breezeEqual(ev.transform(Normalizer(MaxNorm), features), expectedMax) shouldBe true
  }

  it should "handle rows with zero norm" in {
    // the first row is all zeros and must stay that way
    val features = DenseMatrix(
      List(0.0, 0.0, 0.0),
      List(0.0, 3.0, 4.0)
    )
    val expected = DenseMatrix(
      List(0.0, 0.0, 0.0),
      List(0.0, 0.6, 0.8)
    )
    val normalized = ev.transform(Normalizer(), features)

    breezeEqual(normalized, expected) shouldBe true
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{DenseMatrix, DenseVector}
import io.picnicml.doddlemodel.TestingUtils
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Checks the per-row values produced by the norm functions in [[Norms]]. */
class NormsTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  // fixture with an all-zero row, a mixed row and a row with a single negative entry
  private val x = DenseMatrix(
    List(0.0, 0.0, 0.0),
    List(1.0, 2.0, 2.0),
    List(-2.0, 0.0, 0.0)
  )

  "Norms" should "calculate the L2 norm of each row" in {
    val xExpected = DenseVector(0.0, 3.0, 2.0)
    breezeEqual(Norms.L2Norm(x), xExpected) shouldBe true
  }

  // `it` refers back to the "Norms" subject declared above, so the full test names are unchanged
  it should "calculate the L1 norm of each row" in {
    val xExpected = DenseVector(0.0, 5.0, 2.0)
    breezeEqual(Norms.L1Norm(x), xExpected) shouldBe true
  }

  it should "calculate the max norm of each row" in {
    val xExpected = DenseVector(0.0, 2.0, 2.0)
    breezeEqual(Norms.MaxNorm(x), xExpected) shouldBe true
  }

}
Loading