Title: | Data Preprocessing, Binning for Classification and Regression |
---|---|
Description: | Various supervised and unsupervised binning tools including using entropy, recursive partition methods and clustering. |
Authors: | Chapman Siu |
Maintainer: | Chapman Siu <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.2.1 |
Built: | 2025-01-08 06:13:41 UTC |
Source: | https://github.com/sourdoughcat/binst |
Creates bins given breaks
create_bins(x, breaks, method = "cuts")
create_bins(x, breaks, method = "cuts")
x |
X is a numeric vector which is to be discretized |
breaks |
Breaks are the cut points at which the vector X is split. Endpoints are excluded |
method |
The approach used to bin the variable; can be either "cuts" or "hinge". |
A vector of the same length as X is returned, containing the numeric discretization
create_bins(1:10, c(3, 5))
create_bins(1:10, c(3, 5))
A convenience function for creating breaks with various methods.
create_breaks(x, y = NULL, method = "kmeans", control = NULL, ...)
create_breaks(x, y = NULL, method = "kmeans", control = NULL, ...)
x |
X is a numeric vector to be discretized |
y |
Y is the response vector used for calculating metrics for discretization |
method |
Method is the type of discretization approach used. Possible methods are: "dt", "entropy", "kmeans", "jenks", "earth" |
control |
Control is used for optional parameters for the method. It is a list of optional parameters for the function |
... |
Instead of passing a list into control, arguments can be passed as is. |
A vector containing the breaks
kmeans_breaks <- create_breaks(1:10)
create_bins(1:10, kmeans_breaks)
# passing the k means parameter "centers" = 4
kmeans_breaks <- create_breaks(1:10, list(centers=4))
create_bins(1:10, kmeans_breaks)
entropy_breaks <- create_breaks(1:10, rep(c(1,2), each = 5), method="entropy")
create_bins(1:10, entropy_breaks)
dt_breaks <- create_breaks(iris$Sepal.Length, iris$Species, method="dt")
create_bins(iris$Sepal.Length, dt_breaks)
kmeans_breaks <- create_breaks(1:10)
create_bins(1:10, kmeans_breaks)
# passing the k means parameter "centers" = 4
kmeans_breaks <- create_breaks(1:10, list(centers=4))
create_bins(1:10, kmeans_breaks)
entropy_breaks <- create_breaks(1:10, rep(c(1,2), each = 5), method="entropy")
create_bins(1:10, entropy_breaks)
dt_breaks <- create_breaks(iris$Sepal.Length, iris$Species, method="dt")
create_bins(iris$Sepal.Length, dt_breaks)
Create breaks using decision trees (recursive partitioning)
create_dtbreaks(x, y, control = NULL)
create_dtbreaks(x, y, control = NULL)
x |
X is a numeric vector to be discretized |
y |
Y is the response vector used for calculating metrics for discretization |
control |
Control is used for optional parameters for the method |
A vector containing the breaks
dt_breaks <- create_breaks(iris$Sepal.Length, iris$Species, method="dt")
create_bins(iris$Sepal.Length, dt_breaks)
dt_breaks <- create_breaks(iris$Sepal.Length, iris$Species, method="dt")
create_bins(iris$Sepal.Length, dt_breaks)
Create breaks using earth (i.e. MARS)
create_earthbreaks(x, y, control = NULL)
create_earthbreaks(x, y, control = NULL)
x |
X is a numeric vector to be discretized |
y |
Y is the response vector used for calculating metrics for discretization |
control |
Control is used for optional parameters for the method |
A vector containing the breaks
earth_breaks <- create_breaks(x=iris$Sepal.Length, y=iris$Sepal.Width, method="earth")
create_bins(iris$Sepal.Length, earth_breaks)
earth_breaks <- create_breaks(x=iris$Sepal.Length, y=iris$Sepal.Width, method="earth")
create_bins(iris$Sepal.Length, earth_breaks)
Create Jenks breaks
create_jenksbreaks(x, control = NULL)
create_jenksbreaks(x, control = NULL)
x |
X is a numeric vector to be discretized |
control |
Control is used for optional parameters for the method |
A vector containing the breaks
jenks_breaks <- create_breaks(1:10, method="jenks")
create_bins(1:10, jenks_breaks)
jenks_breaks <- create_breaks(1:10, method="jenks")
create_bins(1:10, jenks_breaks)
Create kmeans breaks.
create_kmeansbreaks(x, control = NULL)
create_kmeansbreaks(x, control = NULL)
x |
X is a numeric vector to be discretized |
control |
Control is used for optional parameters for the method |
A vector containing the breaks
kmeans_breaks <- create_breaks(1:10)
create_bins(1:10, kmeans_breaks)
kmeans_breaks <- create_breaks(1:10)
create_bins(1:10, kmeans_breaks)
Create breaks using mdlp
create_mdlpbreaks(x, y)
create_mdlpbreaks(x, y)
x |
X is a numeric vector to be discretized |
y |
Y is the response vector used for calculating metrics for discretization |
A vector containing the breaks
entropy_breaks <- create_breaks(1:10, rep(c(1,2), each = 5), method="entropy")
create_bins(1:10, entropy_breaks)
entropy_breaks <- create_breaks(1:10, rep(c(1,2), each = 5), method="entropy")
create_bins(1:10, entropy_breaks)
Gets the default parameters for each method.
get_control(method = "kmeans", control = NULL)
get_control(method = "kmeans", control = NULL)
method |
Method is the type of discretization approach used |
control |
Control holds the control parameters for the algorithm |
List of default parameters