Is it possible to loop over a collection of models and obtain a collection of predictions?
I'm trying to implement Blagging as described by Tom Fawcett in "Learning from Imbalanced Classes".
I summarized the algorithm:
1) Obtain bootstrap samples from the original imbalanced data set
2) Balance each sample by downsampling
3) Estimate a model (e.g. tree)
4) Put the individual trees to vote
I'm trying to implement the algorithm in RapidMiner using collections. I can create a collection of models successfully, but when I try to loop over this collection to produce a second collection, this time of predictions, I run into trouble. Am I violating a rule about working with collections?
I'm using the abalone data set as an example. I've attached the files.
Any help will be appreciated.
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
<process expanded="true">
<!-- Loads the repository entry "training" and passes it to the Loop operator as its input. -->
<operator activated="true" class="retrieve" compatibility="7.5.003" expanded="true" height="68" name="Retrieve training" width="90" x="45" y="187">
<parameter key="repository_entry" value="training"/>
</operator>
<!-- Loop: runs the inner process 3 times (number_of_iterations). Each iteration builds one decision tree on a bootstrapped, downsampled copy of the training data and delivers the model on "output 1". -->
<operator activated="true" class="concurrency:loop" compatibility="7.5.003" expanded="true" height="82" name="Loop" width="90" x="246" y="187">
<parameter key="number_of_iterations" value="3"/>
<process expanded="true">
<!-- Step 1: draw a bootstrap sample from the training data. -->
<operator activated="true" class="sample_bootstrapping" compatibility="7.5.003" expanded="true" height="82" name="Sample (Bootstrapping)" width="90" x="112" y="34"/>
<!-- Step 2a: keep only the examples whose Class equals "positive". -->
<operator activated="true" class="filter_examples" compatibility="7.5.003" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="187">
<list key="filters_list">
<parameter key="filters_entry_key" value="Class.equals.positive"/>
</list>
</operator>
<!-- Step 2b: store a value taken from the positive subset in the macro "numfraud". No macro type is set here; presumably the default yields the number of examples (TODO confirm against the Extract Macro defaults). -->
<operator activated="true" class="extract_macro" compatibility="7.5.003" expanded="true" height="68" name="Extract Macro" width="90" x="581" y="34">
<parameter key="macro" value="numfraud"/>
<list key="additional_macros"/>
</operator>
<!-- Step 2c: from the unfiltered bootstrap sample (the "original" port of Filter Examples), keep only the examples whose Class equals "negative". -->
<operator activated="true" class="filter_examples" compatibility="7.5.003" expanded="true" height="103" name="Filter Examples (2)" width="90" x="380" y="238">
<list key="filters_list">
<parameter key="filters_entry_key" value="Class.equals.negative"/>
</list>
</operator>
<!-- Step 2d: downsample the negative examples to a sample size of %{numfraud}. -->
<operator activated="true" class="sample" compatibility="7.5.003" expanded="true" height="82" name="Sample" width="90" x="581" y="238">
<parameter key="sample_size" value="%{numfraud}"/>
<list key="sample_size_per_class"/>
<list key="sample_ratio_per_class"/>
<list key="sample_probability_per_class"/>
</operator>
<!-- Step 2e: merge the positive subset and the downsampled negative subset into one example set. -->
<operator activated="true" class="append" compatibility="7.5.003" expanded="true" height="103" name="Append" width="90" x="715" y="136"/>
<!-- Step 3: train a decision tree on the merged set (Gini index criterion, maximal depth 5, prepruning disabled). -->
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.003" expanded="true" height="82" name="Decision Tree" width="90" x="849" y="85">
<parameter key="criterion" value="gini_index"/>
<parameter key="maximal_depth" value="5"/>
<parameter key="apply_prepruning" value="false"/>
</operator>
<!-- Wiring of the inner process; only the trained model leaves the loop body. -->
<connect from_port="input 1" to_op="Sample (Bootstrapping)" to_port="example set input"/>
<connect from_op="Sample (Bootstrapping)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
<connect from_op="Filter Examples" from_port="original" to_op="Filter Examples (2)" to_port="example set input"/>
<connect from_op="Extract Macro" from_port="example set" to_op="Append" to_port="example set 1"/>
<connect from_op="Filter Examples (2)" from_port="example set output" to_op="Sample" to_port="example set input"/>
<connect from_op="Sample" from_port="example set output" to_op="Append" to_port="example set 2"/>
<connect from_op="Append" from_port="merged set" to_op="Decision Tree" to_port="training set"/>
<connect from_op="Decision Tree" from_port="model" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Collect: gathers the models coming from Loop into one collection. unfold is enabled so that, if Loop already delivers its per-iteration results as a collection, the result is flattened into a single-level collection of models instead of a collection that contains one collection. -->
<operator activated="true" class="collect" compatibility="7.5.003" expanded="true" height="82" name="Collect" width="90" x="380" y="187">
<parameter key="unfold" value="true"/>
</operator>
<!-- Loop Collection: applies every model of the collection to the test data and returns a collection of prediction example sets (one per model). The "single" port already hands over exactly one element of the collection per iteration, so it is wired straight into Apply Model. The former Select operator (index=%{i}) expected a collection on that port and has been removed. The iteration macro "i" is still set for optional use (for example in logging) but is no longer needed to pick the model. -->
<operator activated="true" class="loop_collection" compatibility="7.5.003" expanded="true" height="82" name="Loop Collection" width="90" x="581" y="187">
<parameter key="set_iteration_macro" value="true"/>
<parameter key="macro_name" value="i"/>
<process expanded="true">
<!-- Loads the repository entry "test" to be scored by the current model. -->
<operator activated="true" class="retrieve" compatibility="7.5.003" expanded="true" height="68" name="Retrieve test" width="90" x="112" y="187">
<parameter key="repository_entry" value="test"/>
</operator>
<!-- Scores the test data with the model of the current iteration. -->
<operator activated="true" class="apply_model" compatibility="7.5.003" expanded="true" height="82" name="Apply Model" width="90" x="246" y="136">
<list key="application_parameters"/>
</operator>
<!-- Keeps only the Id and the confidence for the positive class, including special attributes, so the per-model outputs can be combined for voting later. -->
<operator activated="true" class="select_attributes" compatibility="7.5.003" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="136">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="confidence(positive)|Id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<connect from_port="single" to_op="Apply Model" to_port="model"/>
<connect from_op="Retrieve test" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Top-level data flow: training data goes into Loop, Loop output goes into Collect, the collection goes into Loop Collection, and the Loop Collection output becomes the process result. -->
<connect from_op="Retrieve training" from_port="output" to_op="Loop" to_port="input 1"/>
<connect from_op="Loop" from_port="output 1" to_op="Collect" to_port="input 1"/>
<connect from_op="Collect" from_port="collection" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Loop Collection" from_port="output 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Answers
Forgot to attach the training set.
Hey,
Can't you just use a Bagging operator and balance the classes inside it (e.g. with Generate Weight (Stratification))?
Best,
Martin
Dortmund, Germany
Alternatively when you create the original models, don't store them in a collection, but rather as separate models in the repository. Then you can simply use the "Vote" ensemble operator or similar to get your final prediction.
Lindon Ventures
Data Science Consulting from Certified RapidMiner Experts