ML Featurizer Examples
Unary Numeric Featurizers
LogTransformFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two numeric columns.
data = spark.createDataFrame(
    [
        (0, 200000, 2.5),
        (1, 10000, 5.0),
        (2, 150000, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Read 'v1' and write the result to 'output'; logType='log10' presumably
# selects a base-10 logarithm (confirm against the featurizer's docs).
logFeaturizer = fr.LogTransformFeaturizer(
    inputCol='v1',
    outputCol='output',
    logType='log10',
)

# Transform the sample data and show the resulting DataFrame.
logFeaturizer.transform(data).show()
PowerTransformFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two float columns.
data = spark.createDataFrame(
    [
        (0, 2.0, 2.5),
        (1, 100.0, 5.0),
        (2, 3.0, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Read 'v1' and write the result to 'output'; powerType=2 presumably
# means "raise to the power of 2" (confirm against the featurizer's docs).
powerFeaturizer = fr.PowerTransformFeaturizer(
    inputCol='v1',
    outputCol='output',
    powerType=2,
)

# Transform the sample data and show the resulting DataFrame.
powerFeaturizer.transform(data).show()
MathFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two float columns.
data = spark.createDataFrame(
    [
        (0, 25.0, 2.5),
        (1, 100.0, 5.0),
        (2, 35.0, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Read 'v1' and write the result to 'output'; the function to apply is
# chosen by name through mathFunction (here 'sqrt').
mathFeaturizer = fr.MathFeaturizer(
    inputCol='v1',
    outputCol='output',
    mathFunction='sqrt',
)

# Transform the sample data and show the resulting DataFrame.
mathFeaturizer.transform(data).show()
Binary Numeric Featurizers
AdditionFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two float columns.
data = spark.createDataFrame(
    [
        (0, 2.5, 2.5),
        (1, 8.0, 5.0),
        (2, 1.0, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Binary featurizer: takes both 'v1' and 'v2' as inputs and writes 'output'.
addFeaturizer = fr.AdditionFeaturizer(
    inputCols=['v1', 'v2'],
    outputCol='output',
)

# Transform the sample data and show the resulting DataFrame.
addFeaturizer.transform(data).show()
SubtractionFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two float columns.
data = spark.createDataFrame(
    [
        (0, 2.5, 2.5),
        (1, 8.0, 5.0),
        (2, 1.0, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Binary featurizer over 'v1' and 'v2'; operand order presumably follows
# the order of inputCols (confirm against the featurizer's docs).
subtractFeaturizer = fr.SubtractionFeaturizer(
    inputCols=['v1', 'v2'],
    outputCol='output',
)

# Transform the sample data and show the resulting DataFrame.
subtractFeaturizer.transform(data).show()
MultiplicationFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two float columns.
data = spark.createDataFrame(
    [
        (0, 2.5, 2.5),
        (1, 8.0, 5.0),
        (2, 1.0, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Binary featurizer: takes both 'v1' and 'v2' as inputs and writes 'output'.
multiplyFeaturizer = fr.MultiplicationFeaturizer(
    inputCols=['v1', 'v2'],
    outputCol='output',
)

# Transform the sample data and show the resulting DataFrame.
multiplyFeaturizer.transform(data).show()
DivisionFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: three rows with an id and two float columns.
data = spark.createDataFrame(
    [
        (0, 2.5, 2.5),
        (1, 8.0, 5.0),
        (2, 1.0, 5.0),
    ],
    ["id", "v1", "v2"],
)

# Binary featurizer over 'v1' and 'v2'; operand order presumably follows
# the order of inputCols (confirm against the featurizer's docs).
divideFeaturizer = fr.DivisionFeaturizer(
    inputCols=['v1', 'v2'],
    outputCol='output',
)

# Transform the sample data and show the resulting DataFrame.
divideFeaturizer.transform(data).show()
Unary Temporal Featurizers
HourOfDayFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: the 'time' column holds timestamps as strings.
data = spark.createDataFrame(
    [
        (0, 2.0, '2018-01-01 12:05:00'),
        (1, 100.0, '2018-12-01 08:05:00'),
        (2, 3.0, '2015-05-01 23:05:00'),
    ],
    ["id", "v1", "time"],
)

# Read 'time' and write the derived value to 'hour'. No format argument is
# passed here, unlike the other temporal examples.
hourFeaturizer = fr.HourOfDayFeaturizer(
    inputCol='time',
    outputCol='hour',
)

# Transform the sample data and show the resulting DataFrame.
hourFeaturizer.transform(data).show()
DayOfWeekFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: the 'time' column holds timestamps as strings.
data = spark.createDataFrame(
    [
        (0, 2.0, '2018-01-01 12:05:00'),
        (1, 100.0, '2018-12-01 08:05:00'),
        (2, 3.0, '2015-05-01 23:05:00'),
    ],
    ["id", "v1", "time"],
)

# Read 'time' and write the derived value to 'day'; the format pattern
# mirrors the layout of the sample timestamp strings above.
dayFeaturizer = fr.DayOfWeekFeaturizer(
    inputCol='time',
    outputCol='day',
    format='yyyy-MM-dd HH:mm:ss',
)

# Transform the sample data and show the resulting DataFrame.
dayFeaturizer.transform(data).show()
MonthOfYearFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: the 'time' column holds timestamps as strings.
data = spark.createDataFrame(
    [
        (0, 2.0, '2018-01-01 12:05:00'),
        (1, 100.0, '2018-12-01 08:05:00'),
        (2, 3.0, '2015-05-01 23:05:00'),
    ],
    ["id", "v1", "time"],
)

# Read 'time' and write the derived value to 'month'; the format pattern
# mirrors the layout of the sample timestamp strings above.
monthFeaturizer = fr.MonthOfYearFeaturizer(
    inputCol='time',
    outputCol='month',
    format='yyyy-MM-dd HH:mm:ss',
)

# Transform the sample data and show the resulting DataFrame.
monthFeaturizer.transform(data).show()
PartsOfDayFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: the 'time' column holds timestamps as strings.
data = spark.createDataFrame(
    [
        (0, 2.0, '2018-01-01 12:05:00'),
        (1, 100.0, '2018-12-01 08:05:00'),
        (2, 3.0, '2015-05-01 23:05:00'),
    ],
    ["id", "v1", "time"],
)

# Read 'time' and write the derived value to 'dayPart'; the format pattern
# mirrors the layout of the sample timestamp strings above.
dayPartFeaturizer = fr.PartsOfDayFeaturizer(
    inputCol='time',
    outputCol='dayPart',
    format='yyyy-MM-dd HH:mm:ss',
)

# Transform the sample data and show the resulting DataFrame.
dayPartFeaturizer.transform(data).show()
WeekendFeaturizer
import mlfeaturizer.core.featurizer as fr
# Sample input: the 'time' column holds timestamps as strings.
data = spark.createDataFrame(
    [
        (0, 2.0, '2018-01-01 12:05:00'),
        (1, 100.0, '2018-12-01 08:05:00'),
        (2, 3.0, '2015-05-01 23:05:00'),
    ],
    ["id", "v1", "time"],
)

# Read 'time' and write the derived value to 'weekend'; the format pattern
# mirrors the layout of the sample timestamp strings above.
weekendFeaturizer = fr.WeekendFeaturizer(
    inputCol='time',
    outputCol='weekend',
    format='yyyy-MM-dd HH:mm:ss',
)

# Transform the sample data and show the resulting DataFrame.
weekendFeaturizer.transform(data).show()
Grouping Featurizer
Group by Sum
import mlfeaturizer.core.featurizer as fr
# Sample input: several rows share the same 'id', so each id forms a group.
data = spark.createDataFrame(
    [
        (0, 250.0, '2018-01-01 12:05:00'),
        (0, 350.0, '2018-01-03 15:15:00'),
        (1, 150.0, '2018-12-01 08:05:00'),
        (1, 580.0, '2018-12-02 20:15:00'),
        (1, 850.0, '2018-12-03 20:15:00'),
        (2, 30.0, '2015-05-01 23:05:00'),
    ],
    ["id", "amountSpent", "time"],
)

# Presumably groups rows by 'id' and aggregates 'amountSpent' with 'sum',
# writing the result to 'sumByGroup' (confirm against the featurizer's docs).
groupByFeaturizer = fr.GroupByFeaturizer(
    inputCol='id',
    aggregateCol='amountSpent',
    aggregateType='sum',
    outputCol='sumByGroup',
)

# Transform the sample data and show the resulting DataFrame.
groupByFeaturizer.transform(data).show()
Group by Min
import mlfeaturizer.core.featurizer as fr
# Sample input: several rows share the same 'id', so each id forms a group.
data = spark.createDataFrame(
    [
        (0, 250.0, '2018-01-01 12:05:00'),
        (0, 350.0, '2018-01-03 15:15:00'),
        (1, 150.0, '2018-12-01 08:05:00'),
        (1, 580.0, '2018-12-02 20:15:00'),
        (1, 850.0, '2018-12-03 20:15:00'),
        (2, 30.0, '2015-05-01 23:05:00'),
    ],
    ["id", "amountSpent", "time"],
)

# Presumably groups rows by 'id' and aggregates 'amountSpent' with 'min',
# writing the result to 'minByGroup' (confirm against the featurizer's docs).
groupByFeaturizer = fr.GroupByFeaturizer(
    inputCol='id',
    aggregateCol='amountSpent',
    aggregateType='min',
    outputCol='minByGroup',
)

# Transform the sample data and show the resulting DataFrame.
groupByFeaturizer.transform(data).show()
Group by Max
import mlfeaturizer.core.featurizer as fr
# Sample input: several rows share the same 'id', so each id forms a group.
data = spark.createDataFrame(
    [
        (0, 250.0, '2018-01-01 12:05:00'),
        (0, 350.0, '2018-01-03 15:15:00'),
        (1, 150.0, '2018-12-01 08:05:00'),
        (1, 580.0, '2018-12-02 20:15:00'),
        (1, 850.0, '2018-12-03 20:15:00'),
        (2, 30.0, '2015-05-01 23:05:00'),
    ],
    ["id", "amountSpent", "time"],
)

# Presumably groups rows by 'id' and aggregates 'amountSpent' with 'max',
# writing the result to 'maxByGroup' (confirm against the featurizer's docs).
groupByFeaturizer = fr.GroupByFeaturizer(
    inputCol='id',
    aggregateCol='amountSpent',
    aggregateType='max',
    outputCol='maxByGroup',
)

# Transform the sample data and show the resulting DataFrame.
groupByFeaturizer.transform(data).show()
Group by Average
import mlfeaturizer.core.featurizer as fr
# Sample input: several rows share the same 'id', so each id forms a group.
data = spark.createDataFrame(
    [
        (0, 250.0, '2018-01-01 12:05:00'),
        (0, 350.0, '2018-01-03 15:15:00'),
        (1, 150.0, '2018-12-01 08:05:00'),
        (1, 580.0, '2018-12-02 20:15:00'),
        (1, 850.0, '2018-12-03 20:15:00'),
        (2, 30.0, '2015-05-01 23:05:00'),
    ],
    ["id", "amountSpent", "time"],
)

# Presumably groups rows by 'id' and aggregates 'amountSpent' with 'avg',
# writing the result to 'avgByGroup' (confirm against the featurizer's docs).
groupByFeaturizer = fr.GroupByFeaturizer(
    inputCol='id',
    aggregateCol='amountSpent',
    aggregateType='avg',
    outputCol='avgByGroup',
)

# Transform the sample data and show the resulting DataFrame.
groupByFeaturizer.transform(data).show()
Group by Count
import mlfeaturizer.core.featurizer as fr
# Sample input: several rows share the same 'id', so each id forms a group.
data = spark.createDataFrame(
    [
        (0, 250.0, '2018-01-01 12:05:00'),
        (0, 350.0, '2018-01-03 15:15:00'),
        (1, 150.0, '2018-12-01 08:05:00'),
        (1, 580.0, '2018-12-02 20:15:00'),
        (1, 850.0, '2018-12-03 20:15:00'),
        (2, 30.0, '2015-05-01 23:05:00'),
    ],
    ["id", "amountSpent", "time"],
)

# Presumably groups rows by 'id' and aggregates 'amountSpent' with 'count',
# writing the result to 'countByGroup' (confirm against the featurizer's docs).
groupByFeaturizer = fr.GroupByFeaturizer(
    inputCol='id',
    aggregateCol='amountSpent',
    aggregateType='count',
    outputCol='countByGroup',
)

# Transform the sample data and show the resulting DataFrame.
groupByFeaturizer.transform(data).show()
GEO Featurizer
Geohash
import mlfeaturizer.core.featurizer as fr
# Sample input: each row carries a latitude/longitude pair and a timestamp string.
data = spark.createDataFrame(
    [
        (0, 37.788866, -122.39821, '2018-01-01 12:05:00'),
        (0, 37.781555, -122.393990, '2018-01-03 15:15:00'),
        (1, 37.791430, -122.401040, '2018-12-01 08:05:00'),
        (1, 37.779510, -122.420220, '2018-12-02 20:15:00'),
        (1, 37.751301, -122.434082, '2018-12-03 20:15:00'),
        (2, 37.754130, -122.488820, '2015-05-01 23:05:00'),
    ],
    ["id", "lat", "lon", "time"],
)

# Read 'lat' and 'lon' (in that order) and write the result to 'geohash';
# precision=8 presumably sets the geohash length (confirm against the docs).
geohashFeaturizer = fr.GeohashFeaturizer(
    inputCols=['lat', 'lon'],
    outputCol='geohash',
    precision=8,
)

# Transform the sample data and show the resulting DataFrame.
geohashFeaturizer.transform(data).show()