The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
Stacking not implemented correctly?
HeikoPaulheim
Member Posts: 13 Contributor II
Hi RapidMiners,
it occurs to me that stacking is not implemented in RapidMiner the way it ideally should be, according to the literature. This can cause some problems.
Consider the model output of the process below. The decision tree, which is learned as a stacking model, only uses the prediction of the first base learner, which is a 1-NN.
The reason here is that 1-NN is perfect on the training data, and the stacking implementation of RapidMiner uses the same training data for both the base learners and the stacking model learner. Thus, this approach is prone to overfitting. It is advised instead in the literature instead (see, e.g., Ting and Witten '99 [1]) to create the training set for the stacking model learner in a folded setting, i.e., train the base learners on 90% of the data, create predictions for 10%, repeat ten times, and use the set of collected predictions for the stacking model learner. This, however, seems not to be the case in RapidMiner.
What are your feelings on this?
Cheers,
Heiko
[1] http://www.cs.waikato.ac.nz/~ml/publications/1999/99KMT-IHW-Issues.pdf
it occurs to me that stacking is not implemented in RapidMiner the way it ideally should be, according to the literature. This can cause some problems.
Consider the model output of the process below. The decision tree, which is learned as a stacking model, only uses the prediction of the first base learner, which is a 1-NN.
The reason here is that 1-NN is perfect on the training data, and the stacking implementation of RapidMiner uses the same training data for both the base learners and the stacking model learner. Thus, this approach is prone to overfitting. It is advised instead in the literature instead (see, e.g., Ting and Witten '99 [1]) to create the training set for the stacking model learner in a folded setting, i.e., train the base learners on 90% of the data, create predictions for 10%, repeat ten times, and use the set of collected predictions for the stacking model learner. This, however, seems not to be the case in RapidMiner.
What are your feelings on this?
Cheers,
Heiko
[1] http://www.cs.waikato.ac.nz/~ml/publications/1999/99KMT-IHW-Issues.pdf
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve Sonar" width="90" x="45" y="30">
<parameter key="repository_entry" value="//Samples/data/Sonar"/>
</operator>
<operator activated="true" class="x_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="179" y="30">
<parameter key="create_complete_model" value="false"/>
<parameter key="average_performances_only" value="true"/>
<parameter key="leave_one_out" value="false"/>
<parameter key="number_of_validations" value="10"/>
<parameter key="sampling_type" value="stratified sampling"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
<process expanded="true">
<operator activated="true" class="stacking" compatibility="5.3.015" expanded="true" height="60" name="Stacking" width="90" x="45" y="30">
<parameter key="keep_all_attributes" value="true"/>
<process expanded="true">
<operator activated="true" breakpoints="before" class="k_nn" compatibility="5.3.015" expanded="true" height="76" name="k-NN" width="90" x="45" y="30">
<parameter key="k" value="1"/>
<parameter key="weighted_vote" value="false"/>
<parameter key="measure_types" value="MixedMeasures"/>
<parameter key="mixed_measure" value="MixedEuclideanDistance"/>
<parameter key="nominal_measure" value="NominalDistance"/>
<parameter key="numerical_measure" value="EuclideanDistance"/>
<parameter key="divergence" value="GeneralizedIDivergence"/>
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="1.0"/>
<parameter key="kernel_sigma1" value="1.0"/>
<parameter key="kernel_sigma2" value="0.0"/>
<parameter key="kernel_sigma3" value="2.0"/>
<parameter key="kernel_degree" value="3.0"/>
<parameter key="kernel_shift" value="1.0"/>
<parameter key="kernel_a" value="1.0"/>
<parameter key="kernel_b" value="0.0"/>
</operator>
<operator activated="true" class="k_nn" compatibility="5.3.015" expanded="true" height="76" name="k-NN (2)" width="90" x="45" y="120">
<parameter key="k" value="31"/>
<parameter key="weighted_vote" value="false"/>
<parameter key="measure_types" value="MixedMeasures"/>
<parameter key="mixed_measure" value="MixedEuclideanDistance"/>
<parameter key="nominal_measure" value="NominalDistance"/>
<parameter key="numerical_measure" value="EuclideanDistance"/>
<parameter key="divergence" value="GeneralizedIDivergence"/>
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="1.0"/>
<parameter key="kernel_sigma1" value="1.0"/>
<parameter key="kernel_sigma2" value="0.0"/>
<parameter key="kernel_sigma3" value="2.0"/>
<parameter key="kernel_degree" value="3.0"/>
<parameter key="kernel_shift" value="1.0"/>
<parameter key="kernel_a" value="1.0"/>
<parameter key="kernel_b" value="0.0"/>
</operator>
<operator activated="true" class="naive_bayes" compatibility="5.3.015" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="210">
<parameter key="laplace_correction" value="true"/>
</operator>
<operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="300">
<parameter key="criterion" value="gain_ratio"/>
<parameter key="minimal_size_for_split" value="4"/>
<parameter key="minimal_leaf_size" value="2"/>
<parameter key="minimal_gain" value="0.1"/>
<parameter key="maximal_depth" value="20"/>
<parameter key="confidence" value="0.25"/>
<parameter key="number_of_prepruning_alternatives" value="3"/>
<parameter key="no_pre_pruning" value="false"/>
<parameter key="no_pruning" value="false"/>
</operator>
<connect from_port="training set 1" to_op="k-NN" to_port="training set"/>
<connect from_port="training set 2" to_op="k-NN (2)" to_port="training set"/>
<connect from_port="training set 3" to_op="Naive Bayes" to_port="training set"/>
<connect from_port="training set 4" to_op="Decision Tree" to_port="training set"/>
<connect from_op="k-NN" from_port="model" to_port="base model 1"/>
<connect from_op="k-NN (2)" from_port="model" to_port="base model 2"/>
<connect from_op="Naive Bayes" from_port="model" to_port="base model 3"/>
<connect from_op="Decision Tree" from_port="model" to_port="base model 4"/>
<portSpacing port="source_training set 1" spacing="0"/>
<portSpacing port="source_training set 2" spacing="0"/>
<portSpacing port="source_training set 3" spacing="0"/>
<portSpacing port="source_training set 4" spacing="0"/>
<portSpacing port="source_training set 5" spacing="0"/>
<portSpacing port="sink_base model 1" spacing="0"/>
<portSpacing port="sink_base model 2" spacing="0"/>
<portSpacing port="sink_base model 3" spacing="0"/>
<portSpacing port="sink_base model 4" spacing="0"/>
<portSpacing port="sink_base model 5" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree (2)" width="90" x="45" y="30">
<parameter key="criterion" value="gain_ratio"/>
<parameter key="minimal_size_for_split" value="4"/>
<parameter key="minimal_leaf_size" value="2"/>
<parameter key="minimal_gain" value="0.1"/>
<parameter key="maximal_depth" value="20"/>
<parameter key="confidence" value="0.25"/>
<parameter key="number_of_prepruning_alternatives" value="3"/>
<parameter key="no_pre_pruning" value="false"/>
<parameter key="no_pruning" value="false"/>
</operator>
<connect from_port="stacking examples" to_op="Decision Tree (2)" to_port="training set"/>
<connect from_op="Decision Tree (2)" from_port="model" to_port="stacking model"/>
<portSpacing port="source_stacking examples" spacing="0"/>
<portSpacing port="sink_stacking model" spacing="0"/>
</process>
</operator>
<connect from_port="training" to_op="Stacking" to_port="training set"/>
<connect from_op="Stacking" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
<list key="application_parameters"/>
<parameter key="create_view" value="false"/>
</operator>
<operator activated="true" class="performance_classification" compatibility="5.3.015" expanded="true" height="76" name="Performance" width="90" x="179" y="30">
<parameter key="main_criterion" value="first"/>
<parameter key="accuracy" value="true"/>
<parameter key="classification_error" value="false"/>
<parameter key="kappa" value="false"/>
<parameter key="weighted_mean_recall" value="false"/>
<parameter key="weighted_mean_precision" value="false"/>
<parameter key="spearman_rho" value="false"/>
<parameter key="kendall_tau" value="false"/>
<parameter key="absolute_error" value="false"/>
<parameter key="relative_error" value="false"/>
<parameter key="relative_error_lenient" value="false"/>
<parameter key="relative_error_strict" value="false"/>
<parameter key="normalized_absolute_error" value="false"/>
<parameter key="root_mean_squared_error" value="false"/>
<parameter key="root_relative_squared_error" value="false"/>
<parameter key="squared_error" value="false"/>
<parameter key="correlation" value="false"/>
<parameter key="squared_correlation" value="false"/>
<parameter key="cross-entropy" value="false"/>
<parameter key="margin" value="false"/>
<parameter key="soft_margin_loss" value="false"/>
<parameter key="logistic_loss" value="false"/>
<parameter key="skip_undefined_labels" value="true"/>
<parameter key="use_example_weights" value="true"/>
<list key="class_weights"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<connect from_op="Retrieve Sonar" from_port="output" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_port="result 2"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
0