Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{*, DenseVector}
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Transformer state for thresholding numerical features to 0.0 / 1.0.
  *
  * The primary constructor is private.
  *
  * @param thresholds per-column thresholds, one for each numerical column in `featureIndex`
  * @param featureIndex feature index whose numerical column indices select the columns to binarize
  */
case class Binarizer private (private val thresholds: RealVector, private val featureIndex: FeatureIndex) {
  private val numericColumnCount = featureIndex.numerical.columnIndices.length
  // the length check is waived when the index contains no numerical columns at all
  require(
    numericColumnCount == 0 || numericColumnCount == thresholds.length,
    "A threshold should be given for every numerical column"
  )
}

object Binarizer {

  /** Builds a binarizer that compares every numerical column against the same threshold.
    *
    * @param threshold value replicated once per numerical column
    * @param featureIndex feature index whose numerical column indices select the columns to binarize
    */
  def apply(threshold: Double, featureIndex: FeatureIndex): Binarizer = {
    val numericColumnCount: Int = featureIndex.numerical.columnIndices.length
    val perColumnThresholds = DenseVector.fill(numericColumnCount)(threshold)
    Binarizer(perColumnThresholds, featureIndex)
  }

  implicit lazy val ev: Transformer[Binarizer] = new Transformer[Binarizer] {

    // fitting returns the model unchanged, so it is reported as fitted from the start
    override def isFitted(model: Binarizer): Boolean = true

    override def fit(model: Binarizer, x: Features): Binarizer = model

    /** Returns a copy of `x` in which each numerical entry is 1.0 if it is strictly greater
      * than its column threshold and 0.0 otherwise; all other columns are left as they are.
      */
    override protected def transformSafe(model: Binarizer, x: Features): Features = {
      val numericalColumns = model.featureIndex.numerical.columnIndices
      val binarized = x.copy

      if (numericalColumns.isEmpty) {
        // nothing to binarize, the copy of the input is returned as is
        binarized
      } else {
        val numericalValues = x(::, numericalColumns).toDenseMatrix
        // row-wise broadcast: every row is compared element by element with the thresholds
        val aboveThreshold = numericalValues(*, ::) >:> model.thresholds
        binarized(::, numericalColumns) := aboveThreshold.mapValues((isAbove: Boolean) =>
          if (isAbove) 1.0 else 0.0)
        binarized
      }
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.*
import io.picnicml.doddlemodel.data.Features
import io.picnicml.doddlemodel.preprocessing.Norms.{L2Norm, Norm}
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Transformer state for scaling data with a row norm.
  *
  * @param normFunction norm applied to the data, `L2Norm` unless another one is given
  */
case class Normalizer private (
  private val normFunction: Norm = L2Norm
)
Comment thread
inejc marked this conversation as resolved.
Outdated

object Normalizer {

  implicit lazy val ev: Transformer[Normalizer] = new Transformer[Normalizer] {

    // fitting returns the model unchanged, so it is reported as fitted from the start
    override def isFitted(model: Normalizer): Boolean = true

    override def fit(model: Normalizer, x: Features): Normalizer = model

    /** Divides the data by the norms computed with the model's norm function. */
    override protected def transformSafe(model: Normalizer, x: Features): Features = {
      val divisors = model.normFunction(x)
      // a zero norm is replaced by 1.0 so that the division leaves such entries untouched
      val hasZeroNorm = divisors :== 0.0
      divisors(hasZeroNorm) := 1.0
      x(::, *) /:/ divisors
    }
  }
}
24 changes: 24 additions & 0 deletions src/main/scala/io/picnicml/doddlemodel/preprocessing/Norms.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{Axis, max, sum}
import breeze.numerics.{abs, pow, sqrt}
import io.picnicml.doddlemodel.data.{Features, RealVector}

object Norms {

  /** A norm that reduces a feature matrix along `Axis._1` to a vector of norm values. */
  sealed trait Norm {
    def apply(x: Features): RealVector
  }

  /** Sum of absolute values along `Axis._1`. */
  final case object L1Norm extends Norm {
    override def apply(x: Features): RealVector = sum(abs(x), Axis._1)
  }

  /** Square root of the sum of squared values along `Axis._1`. */
  final case object L2Norm extends Norm {
    override def apply(x: Features): RealVector = sqrt(sum(pow(x, 2), Axis._1))
  }

  /** Largest absolute value along `Axis._1`. */
  // declared `final` like the other norms so that all members of the ADT share the same modifiers
  final case object MaxNorm extends Norm {
    override def apply(x: Features): RealVector = max(abs(x), Axis._1)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{*, Axis, max, min}
Comment thread
inejc marked this conversation as resolved.
Outdated
import cats.syntax.option._
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.syntax.OptionSyntax._
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Transformer state for linearly scaling numerical features into a target range.
  *
  * The primary constructor is private.
  *
  * @param scale per-column multipliers, empty until the scaler is fitted
  * @param minAdjustment per-column offsets, empty until the scaler is fitted
  * @param range target range as (lower bound, upper bound)
  * @param featureIndex feature index whose numerical column indices select the columns to scale
  */
case class RangeScaler private (
  private val scale: Option[RealVector],
  private val minAdjustment: Option[RealVector],
  private val range: (Double, Double),
  private val featureIndex: FeatureIndex
)

object RangeScaler {

  /** Builds an unfitted scaler for the given target range.
    *
    * @param range target range as (lower bound, upper bound); the upper bound must be greater
    * @param featureIndex feature index whose numerical column indices select the columns to scale
    * @throws IllegalArgumentException if the upper bound is not greater than the lower bound
    */
  def apply(range: (Double, Double), featureIndex: FeatureIndex): RangeScaler = {
    require(range._2 > range._1, "Upper bound of range must be greater than lower bound")
    RangeScaler(none, none, range, featureIndex)
  }

  implicit lazy val ev: Transformer[RangeScaler] = new Transformer[RangeScaler] {

    // both parameters are set together by fit, so both have to be present
    override def isFitted(model: RangeScaler): Boolean =
      Seq(model.scale, model.minAdjustment).forall(_.isDefined)

    /** Computes per-column multipliers and offsets from the column minima and maxima of `x`. */
    override def fit(model: RangeScaler, x: Features): RangeScaler = {
      val (targetLower, targetUpper) = model.range
      val numericalColumns = model.featureIndex.numerical.columnIndices
      val numericalValues = x(::, numericalColumns).toDenseMatrix

      val columnMax: RealVector = max(numericalValues, Axis._0).inner
      val columnMin: RealVector = min(numericalValues, Axis._0).inner
      val columnSpan = columnMax - columnMin
      // constant features (max == min) would cause a division by zero, use a span of one instead
      columnSpan(columnSpan :== 0.0) := 1.0

      val fittedScale = (targetUpper - targetLower) / columnSpan
      val fittedMinAdjustment = targetLower - (columnMin *:* fittedScale)

      model.copy(scale = fittedScale.some, minAdjustment = fittedMinAdjustment.some)
    }

    /** Returns a copy of `x` whose numerical columns are multiplied by the fitted scale and
      * shifted by the fitted offset; all other columns are left as they are.
      */
    override protected def transformSafe(model: RangeScaler, x: Features): Features = {
      val numericalColumns = model.featureIndex.numerical.columnIndices
      val rescaled = x.copy

      if (numericalColumns.isEmpty) {
        // nothing to scale, the copy of the input is returned as is
        rescaled
      } else {
        val numericalValues = x(::, numericalColumns).toDenseMatrix
        // row-wise broadcasts: every row is first multiplied by the scale, then shifted
        val multiplied = numericalValues(*, ::) *:* model.scale.getOrBreak
        val shifted = multiplied(*, ::) +:+ model.minAdjustment.getOrBreak
        rescaled(::, numericalColumns) := shifted
        rescaled
      }
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{DenseMatrix, DenseVector}
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature}
import io.picnicml.doddlemodel.preprocessing.Binarizer.ev
import org.scalatest.{FlatSpec, Matchers}

/** Checks per-column and single-threshold binarization, the no-op case and argument validation. */
class BinarizerTest extends FlatSpec with Matchers with TestingUtils {

  // shared input, the interpretation of its columns is decided by each test's feature index
  private val x = DenseMatrix(
    List(0.0, 1.0, 0.0),
    List(0.3, -1.0, 1.0),
    List(-0.3, 2.0, 0.0)
  )

  "Binarizer" should "process the numerical columns by corresponding thresholds" in {
    val expected = DenseMatrix(
      List(0.0, 1.0, 0.0),
      List(1.0, 1.0, 1.0),
      List(0.0, 1.0, 0.0)
    )
    val perColumnThresholds: DenseVector[Double] = DenseVector(0.0, -1.5)
    val index = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val model = Binarizer(perColumnThresholds, index)

    breezeEqual(ev.transform(model, x), expected) shouldBe true
  }

  it should "process all the numerical columns by a single threshold" in {
    val expected = DenseMatrix(
      List(0.0, 1.0, 0.0),
      List(0.0, 0.0, 1.0),
      List(0.0, 1.0, 0.0)
    )
    val sharedThreshold: Double = 0.5
    val index = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    val model = Binarizer(sharedThreshold, index)

    breezeEqual(ev.transform(model, x), expected) shouldBe true
  }

  it should "amount to no-op if there are no numerical features in data" in {
    val index = FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature))
    val vectorThresholds: DenseVector[Double] = DenseVector(0.0, -1.5)
    val scalarThreshold: Double = 0.5

    val fromVector = Binarizer(vectorThresholds, index)
    val fromScalar = Binarizer(scalarThreshold, index)

    breezeEqual(ev.transform(fromVector, x), x) shouldBe true
    breezeEqual(ev.transform(fromScalar, x), x) shouldBe true
  }

  it should "fail when the amount of passed thresholds is different to number of numerical features in data" in {
    // two thresholds are passed for three numerical columns
    val tooFewThresholds: DenseVector[Double] = DenseVector(0.0, -1.5)
    val index = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))

    an[IllegalArgumentException] should be thrownBy {
      Binarizer(tooFewThresholds, index)
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.preprocessing.Normalizer.ev
import io.picnicml.doddlemodel.preprocessing.Norms.{L1Norm, MaxNorm}
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Checks row normalization with the L2, L1 and max norms, including rows of zero norm. */
class NormalizerTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  "Normalizer" should "scale rows to unit norm using various norms" in {
    val x = DenseMatrix(
      List(1.0, 2.0, 2.0),
      List(-1.0, 1.0, 0.5),
      List(-2.0, 0.0, 0.0)
    )

    // row norms: 3.0, 1.5, 2.0
    val expectedL2 = DenseMatrix(
      List(0.3333, 0.6666, 0.6666),
      List(-0.6666, 0.6666, 0.3333),
      List(-1.0, 0.0, 0.0)
    )
    // row norms: 5.0, 2.5, 2.0
    val expectedL1 = DenseMatrix(
      List(0.2, 0.4, 0.4),
      List(-0.4, 0.4, 0.2),
      List(-1.0, 0.0, 0.0)
    )
    // row norms: 2.0, 1.0, 2.0
    val expectedMax = DenseMatrix(
      List(0.5, 1.0, 1.0),
      List(-1.0, 1.0, 0.5),
      List(-1.0, 0.0, 0.0)
    )

    val l2Normalizer = Normalizer()
    val l1Normalizer = Normalizer(L1Norm)
    val maxNormalizer = Normalizer(MaxNorm)

    breezeEqual(ev.transform(l2Normalizer, x), expectedL2) shouldBe true
    breezeEqual(ev.transform(l1Normalizer, x), expectedL1) shouldBe true
    breezeEqual(ev.transform(maxNormalizer, x), expectedMax) shouldBe true
  }

  it should "handle rows with zero norm" in {
    val x = DenseMatrix(
      List(0.0, 0.0, 0.0),
      List(0.0, 3.0, 4.0)
    )
    // the all-zero row has to come back unchanged
    val expected = DenseMatrix(
      List(0.0, 0.0, 0.0),
      List(0.0, 0.6, 0.8)
    )

    breezeEqual(ev.transform(Normalizer(), x), expected) shouldBe true
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature}
import io.picnicml.doddlemodel.preprocessing.RangeScaler.ev
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Checks range scaling of all or a subset of numerical features and the no-op case. */
class RangeScalerTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  // the first column is constant, the second one spans 0.0 to 5.0
  private val x = DenseMatrix(
    List(-3.0, 2.0, 1.0),
    List(-3.0, 3.0, 0.0),
    List(-3.0, 0.0, 0.0),
    List(-3.0, 5.0, 1.0)
  )

  "Range scaler" should "scale numerical features to specified range" in {
    val expected = DenseMatrix(
      List(0.0, 0.4, 1.0),
      List(0.0, 0.6, 0.0),
      List(0.0, 0.0, 0.0),
      List(0.0, 1.0, 1.0)
    )
    val index = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val fitted = ev.fit(RangeScaler((0.0, 1.0), index), x)

    breezeEqual(ev.transform(fitted, x), expected) shouldBe true
  }

  it should "scale selected subset of numerical features to specified range" in {
    // only the second column is selected, so the first one keeps its original values
    val expected = DenseMatrix(
      List(-3.0, 0.4, 1.0),
      List(-3.0, 0.6, 0.0),
      List(-3.0, 0.0, 0.0),
      List(-3.0, 1.0, 1.0)
    )
    val index = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val fitted = ev.fit(RangeScaler((0.0, 1.0), index.subset(1 to 1)), x)

    breezeEqual(ev.transform(fitted, x), expected) shouldBe true
  }

  it should "amount to no-op if there are no numerical features in data" in {
    val index = FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature))
    val fitted = ev.fit(RangeScaler((0.0, 1.0), index), x)

    breezeEqual(ev.transform(fitted, x), x) shouldBe true
  }
}