Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{*, DenseVector}
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Transformer state for turning numerical features into 0.0 / 1.0 values by thresholding.
  *
  * The constructor is private; instances are created through the `apply` factory methods of the
  * companion object, which validate the arguments.
  *
  * @param featureIndex index describing the columns of the data; its numerical columns are the ones
  *                     that get binarized
  * @param thresholds   one threshold per numerical column
  */
final case class Binarizer private (private val featureIndex: FeatureIndex,
                                    private val thresholds: RealVector)

/** Factory methods and the [[Transformer]] instance for [[Binarizer]]. */
object Binarizer {

  /** Creates a binarizer that applies the same threshold to every numerical column.
    *
    * @param threshold    value every numerical column is compared against
    * @param featureIndex index describing which columns are numerical
    * @throws IllegalArgumentException if the feature index contains no numerical columns
    */
  def apply(threshold: Double, featureIndex: FeatureIndex): Binarizer = {
    val numNumeric = numNumericColumns(featureIndex)
    // replicate the single threshold so that there is one entry per numerical column
    new Binarizer(featureIndex, DenseVector.fill(numNumeric)(threshold))
  }

  /** Creates a binarizer with a dedicated threshold for each numerical column.
    *
    * @param thresholds   thresholds, one per numerical column
    * @param featureIndex index describing which columns are numerical
    * @throws IllegalArgumentException if the feature index contains no numerical columns or if the
    *                                  number of thresholds differs from the number of numerical columns
    */
  def apply(thresholds: RealVector, featureIndex: FeatureIndex): Binarizer = {
    val numNumeric = numNumericColumns(featureIndex)
    require(numNumeric == thresholds.length, "A threshold should be given for every numerical column")
    new Binarizer(featureIndex, thresholds)
  }

  /** Counts the numerical columns and rejects feature indices that contain none. */
  private def numNumericColumns(featureIndex: FeatureIndex): Int = {
    val numNumeric = featureIndex.numerical.columnIndices.length
    require(numNumeric > 0, "There must be at least 1 numeric column in the given data")
    numNumeric
  }

  implicit lazy val ev: Transformer[Binarizer] = new Transformer[Binarizer] {

    /** Nothing is estimated from the data, so the model is returned unchanged. */
    override def fit(model: Binarizer, x: Features): Binarizer = model

    /** Selects the numerical columns and maps each value to 1.0 if it is strictly greater than the
      * threshold of its column and to 0.0 otherwise. Only the numerical columns are returned.
      */
    override protected def transformSafe(model: Binarizer, x: Features): Features = {
      val numericColsOnly = x(::, model.featureIndex.numerical.columnIndices).toDenseMatrix
      // broadcast over rows: every row is compared element-wise against the thresholds vector
      (numericColsOnly(*, ::) >:> model.thresholds).mapValues((v: Boolean) => if (v) 1.0 else 0.0)
    }

    /** Always true: the thresholds are supplied at construction time. */
    override def isFitted(model: Binarizer): Boolean = true
  }
}
Comment thread
inejc marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{*, Axis, max, sum}
import breeze.numerics.{abs, pow, sqrt}
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.typeclasses.Transformer

/** Transformer state for scaling every row of a feature matrix by its norm.
  *
  * The constructor is private; instances are created through the `apply` factory method of the
  * companion object, which selects the norm by name.
  *
  * @param normFunction computes one norm value per row of the given features
  */
final case class Normalizer private (private val normFunction: Features => RealVector)

/** Factory method and the [[Transformer]] instance for [[Normalizer]]. */
object Normalizer {

  /** Creates a normalizer for the norm with the given name.
    *
    * @param norm name of the norm used to scale rows: "l2" (default), "l1" or "max"
    * @throws IllegalArgumentException if the name is not one of the supported norms
    */
  def apply(norm: String = "l2"): Normalizer = {
    // TODO: expose norms for re-use
    val normFunction = norm match {
      case "l2" => (x: Features) => sqrt(sum(pow(x, 2), Axis._1))
      case "l1" => (x: Features) => sum(abs(x), Axis._1)
      case "max" => (x: Features) => max(abs(x), Axis._1)
      case _ =>
        // name the rejected value and the accepted ones so that the caller can fix the argument
        throw new IllegalArgumentException(s"Unsupported norm: '$norm' (expected one of: l2, l1, max)")
    }
    new Normalizer(normFunction)
  }

  implicit lazy val ev: Transformer[Normalizer] = new Transformer[Normalizer] {

    /** Nothing is estimated from the data, so the model is returned unchanged. */
    override def fit(model: Normalizer, x: Features): Normalizer = model

    /** Divides every row of `x` by its norm; rows whose norm is zero are left as they are. */
    override protected def transformSafe(model: Normalizer, x: Features): Features = {
      val rowNorms = model.normFunction(x)
      // no-op for zero vector: dividing by 1.0 instead of 0.0 keeps the row unchanged
      rowNorms(rowNorms :== 0.0) := 1.0
      x(::, *) /:/ rowNorms
    }

    /** Always true: the norm is chosen at construction time. */
    override def isFitted(model: Normalizer): Boolean = true
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{*, Axis, max, min}
Comment thread
inejc marked this conversation as resolved.
Outdated
import cats.syntax.option._
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.typeclasses.Transformer
import io.picnicml.doddlemodel.syntax.OptionSyntax._

/** Transformer state for linearly rescaling numerical features to a target range.
  *
  * @param scale         per-column multiplier; defined once the scaler has been fitted
  * @param minAdjustment per-column offset added after scaling; defined once the scaler has been fitted
  * @param range         target (lower bound, upper bound) for the scaled values
  * @param featureIndex  index describing the columns of the data; its numerical columns are the ones
  *                      that get scaled
  */
final case class RangeScaler private (private val scale: Option[RealVector],
                                      private val minAdjustment: Option[RealVector],
                                      private val range: (Double, Double),
                                      private val featureIndex: FeatureIndex)

/** Factory method and the [[Transformer]] instance for [[RangeScaler]]. */
object RangeScaler {

  /** Creates an unfitted range scaler.
    *
    * @param range        target (lower bound, upper bound) for the scaled values
    * @param featureIndex index describing which columns are numerical
    * @throws IllegalArgumentException if the feature index contains no numerical columns or if the
    *                                  upper bound is not strictly greater than the lower bound
    */
  def apply(range: (Double, Double), featureIndex: FeatureIndex): RangeScaler = {
    val (lowerBound, upperBound) = range
    val numNumeric = featureIndex.numerical.columnIndices.length
    require(numNumeric > 0, "There must be at least 1 numeric column in the given data")
    require(upperBound > lowerBound, "Upper bound of range must be greater than lower bound")
    // scale and offset are unknown until fit is called
    new RangeScaler(none, none, range, featureIndex)
  }

  implicit lazy val ev: Transformer[RangeScaler] = new Transformer[RangeScaler] {

    /** Derives the per-column multiplier and offset from the column minima and maxima of `x`.
      *
      * @return a copy of `model` with `scale` and `minAdjustment` defined
      */
    override def fit(model: RangeScaler, x: Features): RangeScaler = {
      val (lowerBound, upperBound) = model.range
      val numericColsOnly = x(::, model.featureIndex.numerical.columnIndices).toDenseMatrix
      val colMax: RealVector = max(numericColsOnly, Axis._0).inner
      val colMin: RealVector = min(numericColsOnly, Axis._0).inner
      val dataRange = colMax - colMin
      // avoid division by zero for constant features (max == min)
      dataRange(dataRange :== 0.0) := 1.0

      val scaleFactors = (upperBound - lowerBound) / dataRange
      // offset chosen so that the column minimum maps onto the lower bound
      val offsets = lowerBound - (colMin *:* scaleFactors)

      model.copy(scale = scaleFactors.some, minAdjustment = offsets.some)
    }

    /** Multiplies the numerical columns by the fitted scale and adds the fitted offset. Only the
      * numerical columns are returned.
      */
    override protected def transformSafe(model: RangeScaler, x: Features): Features = {
      val numericColsOnly = x(::, model.featureIndex.numerical.columnIndices).toDenseMatrix
      val colsScaled: Features = numericColsOnly(*, ::) *:* model.scale.getOrBreak
      colsScaled(*, ::) +:+ model.minAdjustment.getOrBreak
    }

    /** The scaler counts as fitted once both the scale and the offset are defined. */
    override def isFitted(model: RangeScaler): Boolean =
      model.scale.isDefined && model.minAdjustment.isDefined
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{DenseMatrix, DenseVector}
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature}
import io.picnicml.doddlemodel.preprocessing.Binarizer.ev
import org.scalatest.{FlatSpec, Matchers}

/** Behavioural tests for [[Binarizer]]: per-column thresholds, a single shared threshold and
  * validation of the constructor arguments.
  */
class BinarizerTest extends FlatSpec with Matchers with TestingUtils {

  // shared input; which columns count as numerical is decided by each test's feature index
  val xMatrix = DenseMatrix(
    List(0.0, 1.0, 0.0),
    List(0.3, -1.0, 1.0),
    List(-0.3, 2.0, 0.0)
  )

  "Binarizer" should "process the numerical columns by corresponding thresholds" in {
    val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val thresholds: DenseVector[Double] = DenseVector(0.0, -1.5)

    val binarizer = Binarizer(thresholds, featureIndex)

    // only the two numerical columns are returned
    breezeEqual(ev.transform(binarizer, xMatrix), DenseMatrix(
      List(0.0, 1.0),
      List(1.0, 1.0),
      List(0.0, 1.0)
    )) shouldBe true
  }

  it should "process all the numerical columns by a single threshold" in {
    val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    val threshold: Double = 0.5

    val binarizer = Binarizer(threshold, featureIndex)

    // the comparison result must be asserted, otherwise this test can never fail
    breezeEqual(ev.transform(binarizer, xMatrix), DenseMatrix(
      List(0.0, 1.0, 0.0),
      List(0.0, 0.0, 1.0),
      List(0.0, 1.0, 0.0)
    )) shouldBe true
  }

  it should "fail when there are insufficient/no numeric features in data" in {
    val featureIndex1 = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    val featureIndex2 = FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature))
    val thresholds: DenseVector[Double] = DenseVector(0.0, -1.5)

    // 3 numeric columns vs 2 thresholds
    an [IllegalArgumentException] should be thrownBy Binarizer(thresholds, featureIndex1)
    // 0 numeric columns
    an [IllegalArgumentException] should be thrownBy Binarizer(thresholds, featureIndex2)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.preprocessing.Normalizer.ev
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}

/** Behavioural tests for [[Normalizer]]: row scaling under each supported norm and the handling of
  * all-zero rows.
  */
class NormalizerTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  "Normalizer" should "scale rows to unit norm using various norms" in {
    val input = DenseMatrix(
      List(1.0, 2.0, 2.0),
      List(-1.0, 1.0, 0.5),
      List(-2.0, 0.0, 0.0)
    )

    // expected outputs, one matrix per norm
    val expectedL2 = DenseMatrix(
      List(0.3333, 0.6666, 0.6666),
      List(-0.6666, 0.6666, 0.3333),
      List(-1.0, 0.0, 0.0)
    )
    val expectedL1 = DenseMatrix(
      List(0.2, 0.4, 0.4),
      List(-0.4, 0.4, 0.2),
      List(-1.0, 0.0, 0.0)
    )
    val expectedMax = DenseMatrix(
      List(0.5, 1.0, 1.0),
      List(-1.0, 1.0, 0.5),
      List(-1.0, 0.0, 0.0)
    )

    // the no-argument factory falls back to the default norm ("l2")
    breezeEqual(ev.transform(Normalizer(), input), expectedL2) shouldBe true
    breezeEqual(ev.transform(Normalizer("l1"), input), expectedL1) shouldBe true
    breezeEqual(ev.transform(Normalizer("max"), input), expectedMax) shouldBe true
  }

  it should "handle rows with zero norm" in {
    val input = DenseMatrix(
      List(0.0, 0.0, 0.0),
      List(0.0, 3.0, 4.0)
    )
    // the all-zero row must come back unchanged
    val expected = DenseMatrix(
      List(0.0, 0.0, 0.0),
      List(0.0, 0.6, 0.8)
    )

    breezeEqual(ev.transform(Normalizer(), input), expected) shouldBe true
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseMatrix
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature}
import org.scalatest.{FlatSpec, Matchers}
import io.picnicml.doddlemodel.preprocessing.RangeScaler.ev
import org.scalactic.{Equality, TolerantNumerics}

/** Behavioural tests for [[RangeScaler]]: scaling to a target range and validation of the
  * constructor arguments.
  */
class RangeScalerTest extends FlatSpec with Matchers with TestingUtils {

  implicit val doubleTolerance: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(1e-4)

  "Range scaler" should "scale features to specified range" in {
    // the first column is constant, the third one is categorical
    val xMatrix: DenseMatrix[Double] = DenseMatrix(
      List(-3.0, 2.0, 1.0),
      List(-3.0, 3.0, 0.0),
      List(-3.0, 0.0, 0.0),
      List(-3.0, 5.0, 1.0)
    )
    val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val rangeScaler = RangeScaler((0.0, 1.0), featureIndex)
    val trainedRangeScaler = ev.fit(rangeScaler, xMatrix)

    // only the two numerical columns are returned; the constant one maps onto the lower bound
    breezeEqual(ev.transform(trainedRangeScaler, xMatrix), DenseMatrix(
      List(0.0, 0.4),
      List(0.0, 0.6),
      List(0.0, 0.0),
      List(0.0, 1.0)
    )) shouldBe true
  }

  it should "fail when the range is invalid or there are no numeric features in data" in {
    val numericalIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val categoricalIndex = FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature))

    // upper bound below the lower bound
    an [IllegalArgumentException] should be thrownBy RangeScaler((1.0, 0.0), numericalIndex)
    // upper bound equal to the lower bound
    an [IllegalArgumentException] should be thrownBy RangeScaler((1.0, 1.0), numericalIndex)
    // 0 numeric columns
    an [IllegalArgumentException] should be thrownBy RangeScaler((0.0, 1.0), categoricalIndex)
  }
}