diff --git a/docs/Project.toml b/docs/Project.toml
index eb91252b..6658d1f3 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -21,8 +21,9 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
 
 [compat]
-Documenter = "1"
\ No newline at end of file
+Documenter = "1"
diff --git a/docs/src/common_workflows/composition/notebook.md b/docs/src/common_workflows/composition/notebook.md
index 949d5322..a8d21fff 100644
--- a/docs/src/common_workflows/composition/notebook.md
+++ b/docs/src/common_workflows/composition/notebook.md
@@ -18,27 +18,33 @@ learning model from MLJFlux.
 ````@example composition
 using MLJ # Has MLJFlux models
 using Flux # For more flexibility
-import RDatasets # Dataset source
 import Random # To create imbalance
 import Imbalance # To solve the imbalance
 import Optimisers # native Flux.jl optimisers no longer supported
+using StableRNGs # for reproducibility
+using CategoricalArrays # for unwrap
 ````
 
 ### Loading and Splitting the Data
 
 ````@example composition
-iris = RDatasets.dataset("datasets", "iris");
-y, X = unpack(iris, ==(:Species), rng=123);
-X = Float32.(X); # To be compatible with type of network network parameters
+iris = load_iris() # a named tuple of vectors
+y, X = unpack(iris, ==(:target), rng=StableRNG(123))
+X = Flux.fmap(column -> Float32.(column), X) # Flux prefers Float32 data
 nothing #hide
 ````
 
-To simulate an imbalanced dataset, we will take a random sample:
+The iris dataset has a uniformly distributed target taking the values `"versicolor"`,
+`"setosa"`, and `"virginica"`. To manufacture an imbalanced dataset, we'll combine the
+first two into a single class, `"colosa"`:
 
 ````@example composition
-Random.seed!(803429)
-subset_indices = rand(1:size(X, 1), 100)
-X, y = X[subset_indices, :], y[subset_indices]
+y = coerce(
+    map(y) do species
+        species == "virginica" ? unwrap(species) : "colosa"
+    end,
+    Multiclass,
+)
 Imbalance.checkbalance(y)
 ````
 
@@ -61,7 +67,7 @@ clf = NeuralNetworkClassifier(
     optimiser=Optimisers.Adam(0.01),
     batch_size=8,
     epochs=50,
-    rng=42,
+    rng=StableRNG(123),
 )
 ````
 
@@ -83,20 +89,18 @@ pipeline = standarizer |> balanced_model
 
 By this, any training data will be standardized then oversampled then passed to the
 model. Meanwhile, for inference, the standardizer will automatically use the training
-set's mean and std and the oversampler will be transparent.
+set's mean and std and the oversampler will play no role.
 
 ### Training the Composed Model
 
-It's indistinguishable from training a single model.
+The pipeline model can be evaluated like any other model:
 
 ````@example composition
 mach = machine(pipeline, X, y)
-fit!(mach)
-cv=CV(nfolds=5)
-evaluate!(mach, resampling=cv, measure=accuracy)
+cv = CV(nfolds=6)
+evaluate!(mach, resampling=cv, measure=accuracy, verbosity=0)
 ````
 
 ---
 
 *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
-
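
Pieced together, the revised workflow reads roughly as follows. This is a sketch, not the notebook verbatim: the `@load` calls, the `MLJBalancing`/`RandomOversampler` usage, and the `standarizer`/`balanced_model` constructions are assumptions reconstructed from the context line `pipeline = standarizer |> balanced_model` (the diff never shows their definitions), and the classifier's `builder` is left at its default rather than whatever the notebook defines.

````julia
# A sketch of the post-patch workflow. Assumed (not shown in the diff):
# MLJBalancing for `BalancedModel`, the `@load` calls, and the oversampler
# construction; everything else mirrors the hunks above.
using MLJ                # load_iris, unpack, coerce, Standardizer, CV, evaluate!
using Flux               # fmap and network building blocks
using MLJBalancing       # BalancedModel (assumed import)
using CategoricalArrays  # unwrap
using StableRNGs         # reproducible RNGs across Julia versions
import Imbalance         # checkbalance (oversamplers load via MLJ's @load)
import Optimisers        # native Flux.jl optimisers

# Data preparation, as in the patch:
iris = load_iris()                                    # named tuple of vectors
y, X = unpack(iris, ==(:target), rng=StableRNG(123))
X = Flux.fmap(column -> Float32.(column), X)          # Flux prefers Float32

# Merge "versicolor" and "setosa" into "colosa" to manufacture imbalance:
y = coerce(
    map(s -> s == "virginica" ? unwrap(s) : "colosa", y),
    Multiclass,
)
Imbalance.checkbalance(y)  # expect 100 "colosa" vs 50 "virginica"

# Classifier with the patch's rng change; builder left at its default:
NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux
clf = NeuralNetworkClassifier(
    optimiser=Optimisers.Adam(0.01),
    batch_size=8,
    epochs=50,
    rng=StableRNG(123),
)

# Assumed reconstruction of the composition referenced in the context lines:
RandomOversampler = @load RandomOversampler pkg=Imbalance
balanced_model = BalancedModel(
    model=clf,
    balancer1=RandomOversampler(ratios=1.0, rng=StableRNG(123)),
)
standarizer = Standardizer()  # variable name as spelled in the notebook
pipeline = standarizer |> balanced_model

# Evaluation, as in the patch:
mach = machine(pipeline, X, y)
cv = CV(nfolds=6)
evaluate!(mach, resampling=cv, measure=accuracy, verbosity=0)
````

Threading `StableRNG(123)` through `unpack`, the oversampler, and the classifier, instead of plain integer seeds, is what keeps the generated doc outputs stable across Julia releases, since `Random.default_rng()` streams may change between versions.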