diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 29baf6db..ce7ba389 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,7 +14,7 @@ jobs: uses: rlespinasse/github-slug-action@v3.x - uses: actions/setup-node@v3 with: - node-version: 'lts/*' + node-version: "lts/*" - uses: actions/checkout@v3 - run: | npm install @@ -40,7 +40,7 @@ jobs: with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: gh-pages - user_name: 'openEO CI' + user_name: "openEO CI" user_email: openeo.ci@uni-muenster.de cname: processes.openeo.org - name: deploy to ${{ env.GITHUB_REF_SLUG }} @@ -50,5 +50,5 @@ jobs: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: gh-pages destination_dir: ${{ env.GITHUB_REF_SLUG }} - user_name: 'openEO CI' - user_email: openeo.ci@uni-muenster.de \ No newline at end of file + user_name: "openEO CI" + user_email: openeo.ci@uni-muenster.de diff --git a/CHANGELOG.md b/CHANGELOG.md index f0b68043..2db39a19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,12 +15,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `date_difference` - `filter_vector` - `flatten_dimensions` + - `import_cube` + - `import_ml_model` - `load_geojson` - `load_ml_model` + - `load_stac_ml` - `load_url` - - `ml_fit_class_random_forest` - - `ml_fit_regr_random_forest` + - `mlm_class_catboost` + - `mlm_class_lighttae` + - `mlm_class_mlp` + - `mlm_class_random_forest` + - `mlm_class_svm` + - `mlm_class_tae` + - `mlm_class_tempcnn` + - `mlm_class_xgboost` + - `mlm_regr_random_forest` + - `mlm_regr_svm` + - `ml_fit` + - `ml_label_class` - `ml_predict` + - `ml_predict_probabilities` + - `ml_smooth_class` + - `ml_uncertainty_class` - `save_ml_model` - `unflatten_dimension` - `vector_buffer` @@ -379,4 +395,3 @@ Older versions of the processes were released as part of the openEO API, see the [0.4.2]: [0.4.1]: [0.4.0]: - diff --git a/README.md b/README.md index 24c28899..71f78f57 100644 --- a/README.md +++ 
b/README.md @@ -40,7 +40,7 @@ This repository contains a set of files formally describing the openEO Processes ## Process * All new processes must be added to the [`proposals`](proposals/) folder. -* Processes will only be moved from proposals to the stable process specifications once there are at least two implementations and an example process in the [`examples`](examples/) folder showing it in a use case. This doesn't require a PSC vote individually as it's not a breaking change, just an addition. +* Processes will only be moved from proposals to the stable process specifications once there are at least two implementations and an example process in the [`openEO community examples`](https://github.com/Open-EO/openeo-community-examples/) showing it in a use case. This doesn't require a PSC vote individually as it's not a breaking change, just an addition. * The [`proposals`](proposals/) folder allows breaking changes without a PSC vote and without increasing the major version number (i.e. a breaking change in the proposals doesn't require us to make the next version number 2.0.0). * The proposals are released as experimental processes with the other processes. * Each release and all breaking changes in the stable process specifications must go through PSC vote. 
\ No newline at end of file diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json index 7fa86d89..8980a20d 100644 --- a/proposals/load_ml_model.json +++ b/proposals/load_ml_model.json @@ -1,7 +1,7 @@ { "id": "load_ml_model", - "summary": "Load a ML model", - "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit_regr_random_forest()`` and ``save_ml_model()``.", + "summary": "Load a machine learning model by ID", + "description": "Loads a machine learning model that is managed by the current back-end, identified by a back-end specific model identifier.\n\nThis allows the back-end to host and optimize models for efficient inference and training (e.g., via model-specific libraries such as Terratorch). Back-end specific models can be referenced by a string identifier similar to collections.\n\nIf you want to load a model from a STAC Item implementing the `mlm` extension, use `load_stac_ml()` instead.", "categories": [ "machine learning", "import" @@ -9,27 +9,16 @@ "experimental": true, "parameters": [ { - "name": "uri", - "description": "The STAC Item to load the machine learning model from. 
The STAC Item must implement the `ml-model` extension.", - "schema": [ - { - "title": "URL", - "type": "string", - "format": "uri", - "subtype": "uri", - "pattern": "^https?://" - }, - { - "title": "User-uploaded File", - "type": "string", - "subtype": "file-path", - "pattern": "^[^\r\n\\:'\"]+$" - } - ] + "name": "id", + "description": "The back-end specific identifier of the machine learning model to load.", + "schema": { + "type": "string", + "pattern": "^[\\w\\-\\.~/]+$" + } } ], "returns": { - "description": "A machine learning model to be used with machine learning processes such as ``ml_predict()``.", + "description": "A machine learning model to be used with machine learning processes such as `ml_predict()`.", "schema": { "type": "object", "subtype": "ml-model" @@ -37,10 +26,18 @@ }, "links": [ { - "href": "https://github.com/stac-extensions/ml-model", - "title": "STAC ml-model extension", + "href": "https://github.com/stac-extensions/mlm", + "title": "Machine Learning Model STAC extension", + "type": "text/html", + "rel": "about" + }, + { + "href": "https://openeo.org/documentation/1.0/", + "title": "openEO API documentation", "type": "text/html", "rel": "about" } ] } + + diff --git a/proposals/load_stac_ml.json b/proposals/load_stac_ml.json new file mode 100644 index 00000000..d3c6426f --- /dev/null +++ b/proposals/load_stac_ml.json @@ -0,0 +1,73 @@ +{ + "id": "load_stac_ml", + "summary": "Load a ML model from a STAC Item", + "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit()`` and ``save_ml_model()`` or externally hosted models.", + "categories": [ + "machine learning", + "import" + ], + "experimental": true, + "parameters": [ + { + "name": "uri", + "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the [`mlm`](https://github.com/stac-extensions/mlm) extension. 
This parameter can point to a remote STAC Item via ``URL`` or a local JSON file.", + "schema": [ + { + "title": "URL", + "type": "string", + "format": "uri", + "subtype": "uri", + "pattern": "^https?://" + }, + { + "title": "User-uploaded File", + "type": "string", + "subtype": "file-path", + "pattern": "^[^\r\n\\:'\"]+$" + } + ] + }, + { + "name": "model_asset", + "description": "The Asset name of the given STAC Item which represents the actual ML model. The asset must list ``mlm:model`` as its role. If only one asset lists ``mlm:model`` as its role, this parameter is optional as this asset will be used by default. If multiple assets list ``mlm:model`` as their role, this parameter is required to determine which asset to use.", + "schema": { + "type": "string" + }, + "default": null, + "optional": true + }, + { + "name": "input_index", + "description": "STAC MLM Items support multiple ML model input specifications. This parameter specifies the index of the input specification in the ``mlm:input`` array to use for prediction or training. As ``mlm:input`` is an array, the first input in the array has index 0.", + "schema": { + "type": "integer" + }, + "default": 0, + "optional": true + }, + { + "name": "output_index", + "description": "STAC MLM Items support multiple ML model output specifications. This parameter specifies the index of the output specification in the ``mlm:output`` array to use for prediction or training. 
As ``mlm:output`` is an array, the first output in the array has index 0.", + "schema": { + "type": "integer" + }, + "default": 0, + "optional": true + } + ], + "returns": { + "description": "A machine learning model to be used with machine learning processes such as ``ml_predict()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://github.com/stac-extensions/mlm", + "title": "Machine Learning Model STAC extension", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/ml_fit.json b/proposals/ml_fit.json new file mode 100644 index 00000000..e792bd64 --- /dev/null +++ b/proposals/ml_fit.json @@ -0,0 +1,67 @@ +{ + "id": "ml_fit", + "summary": "Train a machine learning model", + "description": "Executes the fit of a specified machine learning model based on training data.\n\nThe function is generic and supports different machine learning models.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "model", + "description": "The machine learning model to be trained. This should be an instance of a model that supports the `ml_fit` method.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "training_set", + "description": "The training set for the model, provided as a vector data cube. 
This set contains both the independent variables and the dependent variable that the model analyzes to learn patterns and relationships within the data.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + }, + { + "type": "string" + } + ] + }, + { + "name": "target", + "description": "The name of the variable in the training set that serves as the target for model training.", + "schema": { + "type": "string" + } + } + ], + "returns": { + "description": "A trained model object that can be saved with `save_ml_model()` and restored with `load_stac_ml()`.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + } +} + diff --git a/proposals/ml_label_class.json b/proposals/ml_label_class.json new file mode 100644 index 00000000..649fe0e2 --- /dev/null +++ b/proposals/ml_label_class.json @@ -0,0 +1,38 @@ +{ + "id": "ml_label_class", + "summary": "Convert probability data cube to labeled data cube", + "description": "Converts a probability data cube to a labeled data cube by applying softmax normalization and selecting the class with the highest probability. Optionally allows mapping of class indices to custom labels.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The input probability data cube to be labeled. Each band should represent the probability of a different class.", + "schema": { + "type": "object", + "subtype": "datacube" + } + }, + { + "name": "labels", + "description": "Optional dictionary mapping class indices to custom labels. The dictionary keys are the class indices (as integers) and the values are the custom labels for each class. 
If not provided, the class indices will be used as labels.", + "optional": true, + "default": null, + "schema": { + "type": [ + "object", + "null" + ] + } + } + ], + "returns": { + "description": "A labeled data cube where each pixel contains the class label with the highest probability after softmax normalization.", + "schema": { + "type": "object", + "subtype": "datacube" + } + } +} \ No newline at end of file diff --git a/proposals/ml_predict.json b/proposals/ml_predict.json index 87cd2500..4741bcd9 100644 --- a/proposals/ml_predict.json +++ b/proposals/ml_predict.json @@ -17,21 +17,11 @@ }, { "name": "model", - "description": "A ML model that was trained with one of the ML training processes such as ``ml_fit_regr_random_forest()``.", + "description": "A ML model that was trained.", "schema": { "type": "object", "subtype": "ml-model" } - }, - { - "name": "dimensions", - "description": "Zero or more dimensions that will be reduced by the model. Fails with a `DimensionNotAvailable` exception if one of the specified dimensions does not exist.", - "schema": { - "type": "array", - "items": { - "type": "string" - } - } } ], "returns": { @@ -47,3 +37,4 @@ } } } + diff --git a/proposals/ml_predict_probabilities.json b/proposals/ml_predict_probabilities.json new file mode 100644 index 00000000..c44689b6 --- /dev/null +++ b/proposals/ml_predict_probabilities.json @@ -0,0 +1,39 @@ +{ + "id": "ml_predict_probabilities", + "summary": "Predict class probabilities using ML", + "description": "Applies a machine learning model to a data cube of input features and returns the predicted class probabilities.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The data cube containing the input features.", + "schema": { + "type": "object", + "subtype": "datacube" + } + }, + { + "name": "model", + "description": "A ML model that was trained and supports probability predictions.", + "schema": { + "type": 
"object", + "subtype": "ml-model" + } + } + ], + "returns": { + "description": "A data cube with the predicted class probabilities. It removes the specified dimensions and adds a new dimension for the class probabilities. The dimension has the name `classes` and is of type `other`. Each label in the dimension represents a class, and the values are the probabilities for each class.", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "other" + } + ] + } + } +} \ No newline at end of file diff --git a/proposals/ml_smooth_class.json b/proposals/ml_smooth_class.json new file mode 100644 index 00000000..380de6ad --- /dev/null +++ b/proposals/ml_smooth_class.json @@ -0,0 +1,66 @@ +{ + "id": "ml_smooth_class", + "summary": "Apply Bayesian smoothing to classification results", + "description": "Applies spatial smoothing to classification probabilities results from a machine learning model using Bayesian inference. This process helps reduce noise and improve classification accuracy by considering spatial context and neighbourhood information. The smoothing is based on a window-based approach.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The input classification probabilities data to be smoothed. This should be a classified data cube.", + "schema": { + "type": "object", + "subtype": "datacube" + } + }, + { + "name": "window_size", + "description": "The size of the moving window used for spatial smoothing. Must be an odd number to ensure a central pixel.", + "optional": true, + "default": 7, + "schema": { + "type": "integer", + "minimum": 3, + "multipleOf": 2 + } + }, + { + "name": "neighborhood_fraction", + "description": "The fraction of neighbouring pixels to consider in the smoothing process. 
Controls the influence of local spatial information.", + "optional": true, + "default": 0.5, + "schema": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + { + "name": "smoothness", + "description": "Controls the degree of smoothing applied. Higher values result in smoother outputs but may lose fine details.", + "optional": true, + "default": 10, + "schema": { + "type": "integer", + "minimum": 1 + } + } + ], + "returns": { + "description": "A smoothed classification result with reduced noise and improved spatial consistency.", + "schema": { + "type": "object", + "subtype": "datacube" + } + }, + "links": [ + { + "href": "https://doi.org/10.3390/rs16234572", + "title": "Camara et al. (2024): Bayesian Inference for Post-Processing of Remote-Sensing Image Classification", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/ml_tune_grid.json b/proposals/ml_tune_grid.json new file mode 100644 index 00000000..b19ad755 --- /dev/null +++ b/proposals/ml_tune_grid.json @@ -0,0 +1,146 @@ +{ + "id": "ml_tune_grid", + "summary": "Grid search hyperparameter tuning", + "description": "Performs exhaustive grid search over specified hyperparameter combinations to find optimal model settings. Each combination in the parameter grid is evaluated using cross-validation or a hold-out validation set, and the best trained model based on the specified scoring metrics is returned.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "model", + "description": "An untrained machine learning model to be trained and evaluated for each hyperparameter combination.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "training_set", + "description": "The training set for the model, provided as a vector data cube or a reference. 
This set contains both the independent variables and the dependent variable that the model analyzes to learn patterns and relationships within the data.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + }, + { + "type": "string" + } + ] + }, + { + "name": "parameters", + "description": "Hyperparameter grid defining the search space. Each key is a hyperparameter name, and each value is an array of candidate values to try. All combinations will be evaluated.", + "schema": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": {} + } + } + }, + { + "name": "target", + "description": "The name of the variable in the training set that serves as the target or ground truth for model training. This may refer to class labels, continuous values, segmentation masks, or any reference variable depending on the model task. Set to `null` for unsupervised or self-supervised tasks.", + "optional": true, + "default": "label", + "schema": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + { + "name": "scoring", + "description": "One or more metrics used to evaluate and compare model performance. The available metrics depend on the model task and the back-end implementation. Examples include 'accuracy', 'f1', 'kappa', and 'auc' for classification; 'rmse', 'mae', and 'r2' for regression; 'iou' and 'dice' for segmentation; or any custom metric supported by the back-end. If set to `null`, the back-end selects an appropriate default metric based on the model task. 
When multiple metrics are provided, the first metric in the list is used to select the best model.", + "optional": true, + "default": null, + "schema": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + { + "type": "null" + } + ] + }, + { + "name": "cv", + "description": "Number of cross-validation folds. If 0 or 1, uses a single train/validation split instead of cross-validation.", + "optional": true, + "default": 5, + "schema": { + "type": "integer", + "minimum": 0 + } + }, + { + "name": "validation_split", + "description": "Fraction of training data used for validation when cv <= 1. Ignored when using cross-validation.", + "optional": true, + "default": 0.2, + "schema": { + "type": "number", + "minimum": 0, + "exclusiveMaximum": 1 + } + }, + { + "name": "seed", + "description": "Random seed for reproducibility of data splits and model training.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A trained model with the best hyperparameter combination found during the grid search. The model metadata includes the tuning results such as the best hyperparameters and evaluation scores.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://www.jmlr.org/papers/v13/bergstra12a.html", + "title": "Bergstra & Bengio (2012) - Random Search for Hyper-Parameter Optimization", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/ml_tune_random.json b/proposals/ml_tune_random.json new file mode 100644 index 00000000..9d7274c2 --- /dev/null +++ b/proposals/ml_tune_random.json @@ -0,0 +1,185 @@ +{ + "id": "ml_tune_random", + "summary": "Random search hyperparameter tuning", + "description": "Performs random search over hyperparameter distributions to find optimal model settings. 
Unlike grid search, random search samples a fixed number of parameter combinations from specified distributions, which can be more efficient for high-dimensional parameter spaces. Returns the best trained model based on the specified scoring metrics.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "model", + "description": "An untrained machine learning model to be trained and evaluated for each sampled hyperparameter combination.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "training_set", + "description": "The training set for the model, provided as a vector data cube or a reference. This set contains both the independent variables and the dependent variable that the model analyzes to learn patterns and relationships within the data.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + }, + { + "type": "string" + } + ] + }, + { + "name": "parameters", + "description": "Hyperparameter distributions defining the search space. Each key is a hyperparameter name. 
Values can be arrays (uniform sampling), or objects specifying distributions with 'type' ('uniform', 'log_uniform', 'int_uniform', 'choice') and parameters ('min', 'max', 'values').", + "schema": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "array", + "description": "List of discrete values to sample from uniformly.", + "items": {} + }, + { + "type": "object", + "description": "Distribution specification.", + "properties": { + "type": { + "type": "string", + "enum": [ + "uniform", + "log_uniform", + "int_uniform", + "choice" + ] + }, + "min": { + "type": "number" + }, + "max": { + "type": "number" + }, + "values": { + "type": "array" + } + } + } + ] + } + } + }, + { + "name": "n_iter", + "description": "Number of random parameter combinations to sample and evaluate.", + "optional": true, + "default": 10, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "target", + "description": "The name of the variable in the training set that serves as the target or ground truth for model training. This may refer to class labels, continuous values, segmentation masks, or any reference variable depending on the model task. Set to `null` for unsupervised or self-supervised tasks.", + "optional": true, + "default": "label", + "schema": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + { + "name": "scoring", + "description": "One or more metrics used to evaluate and compare model performance. The available metrics depend on the model task and the back-end implementation. Examples include 'accuracy', 'f1', 'kappa', and 'auc' for classification; 'rmse', 'mae', and 'r2' for regression; 'iou' and 'dice' for segmentation; or any custom metric supported by the back-end. If set to `null`, the back-end selects an appropriate default metric based on the model task. 
When multiple metrics are provided, the first metric in the list is used to select the best model.", + "optional": true, + "default": null, + "schema": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + { + "type": "null" + } + ] + }, + { + "name": "cv", + "description": "Number of cross-validation folds. If 0 or 1, uses a single train/validation split instead of cross-validation.", + "optional": true, + "default": 5, + "schema": { + "type": "integer", + "minimum": 0 + } + }, + { + "name": "validation_split", + "description": "Fraction of training data used for validation when cv <= 1. Ignored when using cross-validation.", + "optional": true, + "default": 0.2, + "schema": { + "type": "number", + "minimum": 0, + "exclusiveMaximum": 1 + } + }, + { + "name": "seed", + "description": "Random seed for reproducibility of parameter sampling, data splits, and model training.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A trained model with the best hyperparameter combination found during the random search. The model metadata includes the tuning results such as the best hyperparameters and evaluation scores.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://www.jmlr.org/papers/v13/bergstra12a.html", + "title": "Bergstra & Bengio (2012) - Random Search for Hyper-Parameter Optimization", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/ml_uncertainty_class.json b/proposals/ml_uncertainty_class.json new file mode 100644 index 00000000..86402906 --- /dev/null +++ b/proposals/ml_uncertainty_class.json @@ -0,0 +1,40 @@ +{ + "id": "ml_uncertainty_class", + "summary": "Estimate uncertainty of classification probabilities", + "description": "Calculates uncertainty estimates for classification probabilities using various methods. 
The uncertainty is derived from the probability distribution of class predictions.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The data cube containing the classification probabilities.", + "schema": { + "type": "object", + "subtype": "datacube" + } + }, + { + "name": "method", + "description": "The method to use for uncertainty estimation. 'margin' calculates the difference between the two highest probabilities, 'ratio' uses the ratio of the two highest probabilities, and 'least-confidence' uses 1 minus the highest probability.", + "default": "margin", + "optional": true, + "schema": { + "type": "string", + "enum": [ + "margin", + "ratio", + "least-confidence" + ] + } + } + ], + "returns": { + "description": "A data cube with the uncertainty estimates. The output maintains the same dimensions as the input data cube, but replaces the class probabilities with uncertainty values. Higher values indicate higher uncertainty in the classification.", + "schema": { + "type": "object", + "subtype": "datacube" + } + } +} \ No newline at end of file diff --git a/proposals/ml_validate.json b/proposals/ml_validate.json new file mode 100644 index 00000000..54aa4eb3 --- /dev/null +++ b/proposals/ml_validate.json @@ -0,0 +1,102 @@ +{ + "id": "ml_validate", + "summary": "Validate a trained ML model", + "description": "Evaluates a trained machine learning model on a validation set and computes performance metrics. 
The model must have been previously trained using a process such as ``ml_fit()``, ``ml_tune_grid()``, or ``ml_tune_random()``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "model", + "description": "A trained machine learning model to evaluate.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "validation_set", + "description": "The validation set used to evaluate the model, provided as a vector data cube or a reference. This set must contain the same independent variables the model was trained on, along with the expected target variable for comparison.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + }, + { + "type": "string" + } + ] + }, + { + "name": "target", + "description": "The name of the variable in the validation set that serves as the ground truth for evaluation. This may refer to class labels, continuous values, segmentation masks, or any reference variable depending on the model task. Set to `null` for unsupervised or self-supervised tasks.", + "optional": true, + "default": "label", + "schema": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + { + "name": "scoring", + "description": "One or more metrics to compute for evaluating model performance. The available metrics depend on the model task and the back-end implementation. Examples include 'accuracy', 'f1', 'kappa', and 'auc' for classification; 'rmse', 'mae', and 'r2' for regression; 'iou' and 'dice' for segmentation; or any custom metric supported by the back-end. 
If set to `null`, the back-end selects an appropriate default metric based on the model task.", + "optional": true, + "default": null, + "schema": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + { + "type": "null" + } + ] + } + ], + "returns": { + "description": "The trained model enriched with validation scores in its metadata. The model can be directly used for prediction with ``ml_predict()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1214/09-SS054", + "title": "Arlot & Celisse (2010) - A Survey of Cross-Validation Procedures for Model Selection", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/mlm_class_catboost.json b/proposals/mlm_class_catboost.json new file mode 100644 index 00000000..7a46d0a1 --- /dev/null +++ b/proposals/mlm_class_catboost.json @@ -0,0 +1,62 @@ +{ + "id": "mlm_class_catboost", + "summary": "Initialize a CatBoost classification model", + "description": "Initializes a CatBoost classification model. This component sets up the model structure but does not perform training or handle data splitting. The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "iterations", + "description": "The maximum number of trees that can be built during the training process.", + "optional": true, + "default": 5, + "schema": { + "type": "integer", + "minimum": 1, + "maximum": 500 + } + }, + { + "name": "depth", + "description": "Depth of the trees in the CatBoost model.", + "optional": true, + "default": 5, + "schema": { + "type": "integer", + "minimum": 1, + "maximum": 16 + } + }, + { + "name": "seed", + "description": "The random seed used for training, for reproducibility. 
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": 0, + "schema": { + "type": [ + "integer", + "null" + ], + "minimum": 0, + "maximum": 2147483647 + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://catboost.ai/", + "title": "CatBoost Documentation", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/mlm_class_lighttae.json b/proposals/mlm_class_lighttae.json new file mode 100644 index 00000000..b9f88533 --- /dev/null +++ b/proposals/mlm_class_lighttae.json @@ -0,0 +1,128 @@ +{ + "id": "mlm_class_lighttae", + "summary": "Initialize a LightTAE classification model", + "description": "Initializes a Lightweight Temporal Self-Attention (LightTAE) classification model. This component sets up the model structure but does not perform training or handle data splitting. 
The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "epochs", + "description": "Number of training epochs.", + "optional": true, + "default": 150, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "batch_size", + "description": "Size of the training batches.", + "optional": true, + "default": 128, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "optimizer", + "description": "The optimizer to use for training.", + "optional": true, + "default": "adam", + "schema": { + "type": "string", + "enum": [ + "adam", + "adabound", + "adabelief", + "madagrad", + "nadam", + "qhadam", + "radam", + "swats", + "yogi" + ] + } + }, + { + "name": "learning_rate", + "description": "The learning rate for the optimizer.", + "optional": true, + "default": 0.0005, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "epsilon", + "description": "A small constant for numerical stability in the optimizer.", + "optional": true, + "default": 1e-8, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "weight_decay", + "description": "Weight decay (L2 penalty) for the optimizer.", + "optional": true, + "default": 0.0007, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "lr_decay_epochs", + "description": "Number of epochs between learning rate decay steps.", + "optional": true, + "default": 50, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "lr_decay_rate", + "description": "Rate at which the learning rate decays.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility. 
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1007/978-3-030-65742-0_12", + "title": "V. S. F. Garnot and L. Landrieu (2020), Lightweight Temporal Self-attention for Classifying Satellite Images Time Series", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/mlm_class_mlp.json b/proposals/mlm_class_mlp.json new file mode 100644 index 00000000..45722c74 --- /dev/null +++ b/proposals/mlm_class_mlp.json @@ -0,0 +1,137 @@ +{ + "id": "mlm_class_mlp", + "summary": "Initialize an MLP classification model", + "description": "Initializes a Multi-Layer Perceptron (MLP) classification model. This component sets up the model structure but does not perform training or handle data splitting. 
The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "hidden_layer_sizes", + "description": "List of integers specifying the number of neurons in each hidden layer.", + "default": [ + 512, + 512, + 512 + ], + "schema": { + "type": "array", + "items": { + "type": "integer", + "minimum": 1 + }, + "minItems": 1 + } + }, + { + "name": "activation", + "description": "Activation function for the hidden layers.", + "optional": true, + "default": "relu", + "schema": { + "type": "string", + "enum": [ + "relu", + "tanh", + "logistic" + ] + } + }, + { + "name": "dropout_rates", + "description": "List of numbers between 0 and 1 specifying the dropout rate for each hidden layer.", + "default": [ + 0.4, + 0.3, + 0.2 + ], + "schema": { + "type": "array", + "items": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "minItems": 1 + } + }, + { + "name": "epochs", + "description": "Number of training epochs.", + "optional": true, + "default": 100, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "batch_size", + "description": "Size of the training batches.", + "optional": true, + "default": 64, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "optimizer", + "description": "The optimizer to use for training.", + "optional": true, + "default": "adam", + "schema": { + "type": "string", + "enum": [ + "adam", + "adabound", + "adabelief", + "madagrad", + "nadam", + "qhadam", + "radam", + "swats", + "yogi" + ] + } + }, + { + "name": "learning_rate", + "description": "The learning rate for the optimizer.", + "optional": true, + "default": 0.001, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility. 
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://arxiv.org/abs/1611.06455", + "title": "Z. Wang, W. Yan, and T. Oates (2017), Time Series Classification from Scratch with Deep Neural Networks: A Strong Baseline", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/ml_fit_class_random_forest.json b/proposals/mlm_class_random_forest.json similarity index 51% rename from proposals/ml_fit_class_random_forest.json rename to proposals/mlm_class_random_forest.json index 63da48a1..0881e0f0 100644 --- a/proposals/ml_fit_class_random_forest.json +++ b/proposals/mlm_class_random_forest.json @@ -1,55 +1,12 @@ { - "id": "ml_fit_class_random_forest", - "summary": "Train a random forest classification model", - "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).", + "id": "mlm_class_random_forest", + "summary": "Initialize a random forest classification model", + "description": "Initializes a Random Forest classification model. This component sets up the model structure but does not perform training or handle data splitting. The resulting model can be trained later using ``ml_fit``.", "categories": [ "machine learning" ], "experimental": true, "parameters": [ - { - "name": "predictors", - "description": "The predictors for the classification model as a vector data cube. 
Aggregated to the features (vectors) of the target input variable.", - "schema": [ - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "bands" - } - ] - }, - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "other" - } - ] - } - ] - }, - { - "name": "target", - "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).", - "schema": { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - } - ] - } - }, { "name": "max_variables", "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. 
This is often the default for classification.", @@ -93,7 +50,7 @@ } ], "returns": { - "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "description": "A model object that can be trained using ``ml_fit``.", "schema": { "type": "object", "subtype": "ml-model" @@ -107,4 +64,4 @@ "rel": "about" } ] -} +} \ No newline at end of file diff --git a/proposals/mlm_class_svm.json b/proposals/mlm_class_svm.json new file mode 100644 index 00000000..bf0a8ca7 --- /dev/null +++ b/proposals/mlm_class_svm.json @@ -0,0 +1,112 @@ +{ + "id": "mlm_class_svm", + "summary": "Initialize an SVM classification model", + "description": "Initializes a Support Vector Machine (SVM) classification model. This component sets up the model structure but does not perform training or handle data splitting. The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "kernel", + "description": "Specifies the kernel type to be used in the algorithm.", + "optional": true, + "default": "rbf", + "schema": { + "type": "string", + "enum": [ + "linear", + "poly", + "rbf", + "sigmoid" + ] + } + }, + { + "name": "C", + "description": "Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "gamma", + "description": "Kernel coefficient for 'rbf', 'poly', and 'sigmoid'. 
Higher values lead to tighter fits.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "degree", + "description": "Degree of the polynomial kernel function (only relevant for 'poly' kernel).", + "optional": true, + "default": 3, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "coef0", + "description": "Independent term in the kernel function (only relevant for 'poly' and 'sigmoid' kernels).", + "optional": true, + "default": 0, + "schema": { + "type": "number" + } + }, + { + "name": "tolerance", + "description": "Tolerance of termination criterion.", + "optional": true, + "default": 0.001, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "cachesize", + "description": "Size of the kernel cache in MB.", + "optional": true, + "default": 1000, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility. If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://link.springer.com/article/10.1007/BF00994018", + "title": "C. Cortes and V. Vapnik (1995), Support-vector networks", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/mlm_class_tae.json b/proposals/mlm_class_tae.json new file mode 100644 index 00000000..6f81e30e --- /dev/null +++ b/proposals/mlm_class_tae.json @@ -0,0 +1,128 @@ +{ + "id": "mlm_class_tae", + "summary": "Initialize a TAE classification model", + "description": "Initializes a Temporal Self-Attention (TAE) classification model. 
This component sets up the model structure but does not perform training or handle data splitting. The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "epochs", + "description": "Number of training epochs.", + "optional": true, + "default": 150, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "batch_size", + "description": "Size of the training batches.", + "optional": true, + "default": 64, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "optimizer", + "description": "The optimizer to use for training.", + "optional": true, + "default": "adam", + "schema": { + "type": "string", + "enum": [ + "adam", + "adabound", + "adabelief", + "madagrad", + "nadam", + "qhadam", + "radam", + "swats", + "yogi" + ] + } + }, + { + "name": "learning_rate", + "description": "The learning rate for the optimizer.", + "optional": true, + "default": 0.001, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "epsilon", + "description": "A small constant for numerical stability in the optimizer.", + "optional": true, + "default": 1e-8, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "weight_decay", + "description": "Weight decay (L2 penalty) for the optimizer.", + "optional": true, + "default": 0.000001, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "lr_decay_epochs", + "description": "Number of epochs between learning rate decay steps.", + "optional": true, + "default": 1, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "lr_decay_rate", + "description": "Rate at which the learning rate decays.", + "optional": true, + "default": 0.95, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility. 
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1109/CVPR42600.2020.01234", + "title": "V. Garnot, L. Landrieu, S. Giordano, and N. Chehata (2020), Satellite Image Time Series Classification With Pixel-Set Encoders and Temporal Self-Attention", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/mlm_class_tempcnn.json b/proposals/mlm_class_tempcnn.json new file mode 100644 index 00000000..cc9da6f6 --- /dev/null +++ b/proposals/mlm_class_tempcnn.json @@ -0,0 +1,159 @@ +{ + "id": "mlm_class_tempcnn", + "summary": "Initialize a TempCNN classification model", + "description": "Initializes a Temporal Convolutional Neural Network (TempCNN) classification model. This component sets up the model structure but does not perform training or handle data splitting. 
The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "cnn_layers", + "description": "List of integers specifying the number of filters in each convolutional layer.", + "default": [ + 256, + 256, + 256 + ], + "schema": { + "type": "array", + "items": { + "type": "integer", + "minimum": 1 + }, + "minItems": 1 + } + }, + { + "name": "cnn_kernels", + "description": "List of integers specifying the kernel size for each convolutional layer.", + "default": [ + 7, + 7, + 7 + ], + "schema": { + "type": "array", + "items": { + "type": "integer", + "minimum": 1 + }, + "minItems": 1 + } + }, + { + "name": "cnn_dropout_rates", + "description": "List of numbers between 0 and 1 specifying the dropout rate for each convolutional layer.", + "default": [ + 0.2, + 0.2, + 0.2 + ], + "schema": { + "type": "array", + "items": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "minItems": 1 + } + }, + { + "name": "dense_layer_nodes", + "description": "Number of nodes in the dense layer.", + "default": 256, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "dense_layer_dropout_rate", + "description": "Dropout rate for the dense layer.", + "default": 0.5, + "schema": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + { + "name": "epochs", + "description": "Number of training epochs.", + "optional": true, + "default": 100, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "batch_size", + "description": "Size of the training batches.", + "optional": true, + "default": 64, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "optimizer", + "description": "The optimizer to use for training.", + "optional": true, + "default": "adam", + "schema": { + "type": "string", + "enum": [ + "adam", + "adabound", + "adabelief", + "madagrad", + "nadam", + "qhadam", + "radam", + "swats", + "yogi" + ] + } + }, + { + 
"name": "learning_rate", + "description": "The learning rate for the optimizer.", + "optional": true, + "default": 0.001, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility. If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.3390/rs11050523", + "title": "Pelletier et al. (2019): Temporal Convolutional Neural Network for the Classification of Satellite Image Time Series", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/mlm_class_xgboost.json b/proposals/mlm_class_xgboost.json new file mode 100644 index 00000000..a7b21f50 --- /dev/null +++ b/proposals/mlm_class_xgboost.json @@ -0,0 +1,89 @@ +{ + "id": "mlm_class_xgboost", + "summary": "Initialize an XGBoost classification model", + "description": "Initializes an XGBoost classification model. This component sets up the model structure but does not perform training or handle data splitting. 
The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "learning_rate", + "description": "Step size shrinkage used in update to prevent overfitting.", + "optional": true, + "default": 0.15, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "max_depth", + "description": "Maximum depth of a tree.", + "optional": true, + "default": 5, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "min_child_weight", + "description": "Minimum sum of instance weight (hessian) needed in a child.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "subsample", + "description": "Subsample ratio of the training instances.", + "optional": true, + "default": 0.8, + "schema": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + { + "name": "min_split_loss", + "description": "Minimum loss reduction required to make a further partition on a leaf node of the tree.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility.
If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://dl.acm.org/doi/10.1145/2939672.2939785", + "title": "Chen and Guestrin (2016), XGBoost: A Scalable Tree Boosting System", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/ml_fit_regr_random_forest.json b/proposals/mlm_regr_random_forest.json similarity index 51% rename from proposals/ml_fit_regr_random_forest.json rename to proposals/mlm_regr_random_forest.json index 39207324..f643a73a 100644 --- a/proposals/ml_fit_regr_random_forest.json +++ b/proposals/mlm_regr_random_forest.json @@ -1,55 +1,12 @@ { - "id": "ml_fit_regr_random_forest", - "summary": "Train a random forest regression model", - "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).", + "id": "mlm_regr_random_forest", + "summary": "Initialize a random forest regression model", + "description": "Initializes a Random Forest regression model. This component prepares the model structure but does not perform training. The model can be trained later using ``ml_fit``.", "categories": [ "machine learning" ], "experimental": true, "parameters": [ - { - "name": "predictors", - "description": "The predictors for the regression model as a vector data cube. 
Aggregated to the features (vectors) of the target input variable.", - "schema": [ - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "bands" - } - ] - }, - { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - }, - { - "type": "other" - } - ] - } - ] - }, - { - "name": "target", - "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).", - "schema": { - "type": "object", - "subtype": "datacube", - "dimensions": [ - { - "type": "geometry" - } - ] - } - }, { "name": "max_variables", "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.", @@ -93,7 +50,7 @@ } ], "returns": { - "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "description": "A model object that can be trained using ``ml_fit``.", "schema": { "type": "object", "subtype": "ml-model" @@ -108,3 +65,4 @@ } ] } + diff --git a/proposals/mlm_regr_svm.json b/proposals/mlm_regr_svm.json new file mode 100644 index 00000000..ba63930c --- /dev/null +++ b/proposals/mlm_regr_svm.json @@ -0,0 +1,122 @@ +{ + "id": "mlm_regr_svm", + "summary": "Initialize an SVM regression model", + "description": "Initializes a Support Vector Machine (SVM) regression model. 
This component sets up the model structure but does not perform training or handle data splitting. The resulting model can be trained later using ``ml_fit``.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "kernel", + "description": "Specifies the kernel type to be used in the algorithm.", + "optional": true, + "default": "rbf", + "schema": { + "type": "string", + "enum": [ + "linear", + "poly", + "rbf", + "sigmoid" + ] + } + }, + { + "name": "C", + "description": "Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "epsilon", + "description": "Epsilon in the epsilon-SVR model. Specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value.", + "optional": true, + "default": 0.1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "gamma", + "description": "Kernel coefficient for 'rbf', 'poly', and 'sigmoid'. 
Higher values lead to tighter fits.", + "optional": true, + "default": 1, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "degree", + "description": "Degree of the polynomial kernel function (only relevant for 'poly' kernel).", + "optional": true, + "default": 3, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "coef0", + "description": "Independent term in the kernel function (only relevant for 'poly' and 'sigmoid' kernels).", + "optional": true, + "default": 0, + "schema": { + "type": "number" + } + }, + { + "name": "tolerance", + "description": "Tolerance of termination criterion.", + "optional": true, + "default": 0.001, + "schema": { + "type": "number", + "minimum": 0 + } + }, + { + "name": "cachesize", + "description": "Size of the kernel cache in MB.", + "optional": true, + "default": 1000, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for reproducibility. If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be trained using ``ml_fit``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://link.springer.com/chapter/10.1007/978-1-4302-5990-9_4", + "title": "Awad, M., & Khanna, R. (2015). Support vector regression.
Efficient learning machines: Theories, concepts, and applications for engineers and system designers.", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json index 5e9ea8b0..ab5820ba 100644 --- a/proposals/save_ml_model.json +++ b/proposals/save_ml_model.json @@ -1,7 +1,7 @@ { "id": "save_ml_model", "summary": "Save a ML model", - "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).", + "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [mlm-model extension](https://github.com/stac-extensions/mlm).", "categories": [ "machine learning", "import" @@ -9,13 +9,20 @@ "experimental": true, "parameters": [ { - "name": "data", - "description": "The data to store as a machine learning model.", + "name": "model", + "description": "The machine learning model to store.", "schema": { "type": "object", "subtype": "ml-model" } }, + { + "name": "name", + "description": "A distinct name of the model.", + "schema": { + "type": "string" + } + }, { "name": "options", "description": "Additional parameters to create the file(s).", @@ -35,8 +42,8 @@ }, "links": [ { - "href": "https://github.com/stac-extensions/ml-model", - "title": "STAC ml-model extension", + "href": "https://github.com/stac-extensions/mlm", + "title": "Machine Learning Model STAC extension", "type": "text/html", "rel": "about" } diff --git a/tests/.words b/tests/.words index a50285ba..d5ac5038 100644 --- a/tests/.words +++ b/tests/.words @@ -47,3 +47,58 @@ Hyndman date1 date2 favor +mlm-model +analyzes +XGBoost +Chen +Guestrin +Subsample +hessian +overfitting +Wang +Yan +Oates +adam +sgd +minibatches +Perceptron +feedforward +backpropagation +TempCNN +Pelletier +Cortes 
+Vapnik +rbf +SVM +Garnot +Landrieu +Giordano +Chehata +LightTAE +TAE +least-confidence +Camara +softmax +Khanna +Awad +CatBoost +epsilon-SVR +Terratorch +hyperparameter +hyperparameters +Hyperparameter +f1 +auc +rmse +mse +mae +r2 +cv +Bergstra +Bengio +Arlot +Celisse +log_uniform +int_uniform +iou +dice \ No newline at end of file diff --git a/tests/docs.html b/tests/docs.html index 04b1c192..1f4aaa58 100644 --- a/tests/docs.html +++ b/tests/docs.html @@ -1,13 +1,19 @@ - - - + + + openEO API Processes - - + + - +