The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
Bagging for Imbalanced dataset
aryan_hosseinza
Member Posts: 74 Contributor II
Hi everybody ,
I have a very imbalanced dataset (t: 14%, f: 86%). I want to use bagging in such a way that I sample roughly 1/3 of the f class, union it with the true class, and train Naive Bayes on the result.
I have two questions:
1. How can I do this kind of sampling (like what happens in the Bagging operator in RapidMiner, but sampling only the majority class instead of the whole dataset)?
2. What type of Naive Bayes do you suggest I use inside Bagging? There are several implementations of Naive Bayes in RapidMiner. Should it be reweightable? Should it be updateable?
Thanks
I have a very imbalanced dataset (t: 14%, f: 86%). I want to use bagging in such a way that I sample roughly 1/3 of the f class, union it with the true class, and train Naive Bayes on the result.
I have two questions:
1. How can I do this kind of sampling (like what happens in the Bagging operator in RapidMiner, but sampling only the majority class instead of the whole dataset)?
2. What type of Naive Bayes do you suggest I use inside Bagging? There are several implementations of Naive Bayes in RapidMiner. Should it be reweightable? Should it be updateable?
Thanks
0
Answers
Use the Sample operator to get data sets of the same size for each class.
Append them back together and use the normal Naive Bayes.
On a side note, Naive Bayes is perfectly capable of dealing with skewed data sets.
So this procedure is a bit weird.
The other Naive Bayes implementations are likely to perform worse, because they are designed for different purposes.
This is especially true of the updatable one. If you wish, you can use W-NaiveBayes; it should be as good as the Naive Bayes from RapidMiner.
<process version="5.2.008">
  <!--
    Balanced sampling demo on the Sonar sample data.
    The data is filtered into the "Rock" examples and the remaining examples,
    each part is sampled with sample_size 50, the two samples are appended,
    and a Naive Bayes model is trained on the merged set.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="390" width="547">
      <!-- Load the Sonar example set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
      </operator>
      <!-- Keep only the examples matching class=Rock. -->
      <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Rock" width="90" x="180" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="class=Rock"/>
      </operator>
      <!-- Sample the Rock examples with sample_size 50 (no sample mode is set here, so the operator default applies). -->
      <operator activated="true" class="sample" compatibility="5.2.008" expanded="true" height="76" name="SampleR" width="90" x="313" y="30">
        <parameter key="sample_size" value="50"/>
        <list key="sample_size_per_class"/>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>
      <!-- Same condition as "Rock" but with invert_filter=true, so this keeps the examples that are not Rock. -->
      <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Mine" width="90" x="179" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="class=Rock"/>
        <parameter key="invert_filter" value="true"/>
      </operator>
      <!-- Sample the non-Rock examples with the same sample_size as SampleR. -->
      <operator activated="true" class="sample" compatibility="5.2.008" expanded="true" height="76" name="SampleM" width="90" x="313" y="120">
        <parameter key="sample_size" value="50"/>
        <list key="sample_size_per_class"/>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>
      <!-- Merge the two samples and train Naive Bayes on the result. -->
      <operator activated="true" class="append" compatibility="5.2.008" expanded="true" height="94" name="Append" width="90" x="112" y="210"/>
      <operator activated="true" class="naive_bayes" compatibility="5.2.008" expanded="true" height="76" name="Naive Bayes" width="90" x="246" y="210"/>
      <!-- Wiring: "Mine" reads the unfiltered data from the "original" port of "Rock". -->
      <connect from_op="Retrieve" from_port="output" to_op="Rock" to_port="example set input"/>
      <connect from_op="Rock" from_port="example set output" to_op="SampleR" to_port="example set input"/>
      <connect from_op="Rock" from_port="original" to_op="Mine" to_port="example set input"/>
      <connect from_op="SampleR" from_port="example set output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Mine" from_port="example set output" to_op="SampleM" to_port="example set input"/>
      <connect from_op="SampleM" from_port="example set output" to_op="Append" to_port="example set 2"/>
      <connect from_op="Append" from_port="merged set" to_op="Naive Bayes" to_port="training set"/>
      <!-- The trained model is delivered as the first process result. -->
      <connect from_op="Naive Bayes" from_port="model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
I know what you did, but it's not exactly what I meant.
OK, let's work it out for this example:
T class: 14%
F class: 86%
I want to split the F class into 3 parts, each about 29% of the whole dataset (F1, F2, F3), and I want to train the algorithm (e.g. Naive Bayes or a decision tree) on (T U F1), (T U F2) and (T U F3), and then test it. The rest is just like what is done in Bagging. The problem with Bagging is that it samples from the WHOLE dataset; it cannot keep one part of the dataset fixed and split only the rest, as I explained above.
<process version="5.2.008">
  <!--
    Three-way split demo on the Sonar sample data.
    The "Rock" examples are copied three times; the remaining examples are
    split into three partitions (ratios 0.34 / 0.33 / 0.33). Each partition is
    appended to one copy of the Rock examples and a separate Naive Bayes model
    is trained on each merged set.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="450" width="567">
      <!-- Load the Sonar example set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
      </operator>
      <!-- Keep only the examples matching class=Rock. -->
      <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Rock" width="90" x="180" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="class=Rock"/>
      </operator>
      <!-- Fan the Rock examples out so that each of the three Append operators receives them. -->
      <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="112" name="Multiply" width="90" x="447" y="30"/>
      <!-- Same condition as "Rock" but with invert_filter=true, so this keeps the examples that are not Rock. -->
      <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Mine" width="90" x="45" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="class=Rock"/>
        <parameter key="invert_filter" value="true"/>
      </operator>
      <!-- Split the non-Rock examples into three partitions; the ratios sum to 1.0. -->
      <operator activated="true" class="split_data" compatibility="5.2.008" expanded="true" height="112" name="Split Data" width="90" x="45" y="210">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.34"/>
          <parameter key="ratio" value="0.33"/>
          <parameter key="ratio" value="0.33"/>
        </enumeration>
      </operator>
      <!-- Third learner: Rock copy 3 plus partition 3. -->
      <operator activated="true" class="append" compatibility="5.2.008" expanded="true" height="94" name="Append (3)" width="90" x="179" y="300"/>
      <operator activated="true" class="naive_bayes" compatibility="5.2.008" expanded="true" height="76" name="Naive Bayes (3)" width="90" x="313" y="300"/>
      <!-- Second learner: Rock copy 2 plus partition 2. -->
      <operator activated="true" class="append" compatibility="5.2.008" expanded="true" height="94" name="Append (2)" width="90" x="179" y="210"/>
      <operator activated="true" class="naive_bayes" compatibility="5.2.008" expanded="true" height="76" name="Naive Bayes (2)" width="90" x="313" y="210"/>
      <!-- First learner: Rock copy 1 plus partition 1. -->
      <operator activated="true" class="append" compatibility="5.2.008" expanded="true" height="94" name="Append" width="90" x="179" y="120"/>
      <operator activated="true" class="naive_bayes" compatibility="5.2.008" expanded="true" height="76" name="Naive Bayes" width="90" x="313" y="120"/>
      <!-- Wiring: "Mine" reads the unfiltered data from the "original" port of "Rock". -->
      <connect from_op="Retrieve" from_port="output" to_op="Rock" to_port="example set input"/>
      <connect from_op="Rock" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Rock" from_port="original" to_op="Mine" to_port="example set input"/>
      <!-- Each Append gets the Rock copy on "example set 1" and one partition on "example set 2". -->
      <connect from_op="Multiply" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Append (2)" to_port="example set 1"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Append (3)" to_port="example set 1"/>
      <connect from_op="Mine" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Append" to_port="example set 2"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Append (2)" to_port="example set 2"/>
      <connect from_op="Split Data" from_port="partition 3" to_op="Append (3)" to_port="example set 2"/>
      <!-- The three trained models are delivered as process results 1 to 3. -->
      <connect from_op="Append (3)" from_port="merged set" to_op="Naive Bayes (3)" to_port="training set"/>
      <connect from_op="Naive Bayes (3)" from_port="model" to_port="result 3"/>
      <connect from_op="Append (2)" from_port="merged set" to_op="Naive Bayes (2)" to_port="training set"/>
      <connect from_op="Naive Bayes (2)" from_port="model" to_port="result 2"/>
      <connect from_op="Append" from_port="merged set" to_op="Naive Bayes" to_port="training set"/>
      <connect from_op="Naive Bayes" from_port="model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="90"/>
      <portSpacing port="sink_result 2" spacing="72"/>
      <portSpacing port="sink_result 3" spacing="72"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
<process version="5.2.008">
  <!--
    Bagging over one class only, on the Sonar sample data.
    The "Rock" examples are stored under the name "R". Bagging (3 iterations)
    runs on the remaining examples; inside each iteration the stored Rock
    examples are recalled, appended to the training set handed in by Bagging,
    and a Naive Bayes model is trained on the merged set.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="450" width="567">
      <!-- Load the Sonar example set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
      </operator>
      <!-- Keep only the examples matching class=Rock. -->
      <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Rock" width="90" x="180" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="class=Rock"/>
      </operator>
      <!-- Store the Rock examples under the name "R" so the Bagging subprocess can fetch them. -->
      <operator activated="true" class="remember" compatibility="5.2.008" expanded="true" height="60" name="Remember" width="90" x="313" y="30">
        <parameter key="name" value="R"/>
        <parameter key="io_object" value="ExampleSet"/>
      </operator>
      <!-- Same condition as "Rock" but with invert_filter=true, so this keeps the examples that are not Rock. -->
      <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Mine" width="90" x="45" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="class=Rock"/>
        <parameter key="invert_filter" value="true"/>
      </operator>
      <!--
        Bagging with 3 iterations over the non-Rock examples only.
        NOTE(review): no sample_ratio is set here, so the operator default applies;
        set sample_ratio explicitly if each bag should hold a specific fraction
        (for example about one third) of these examples. TODO confirm the default.
      -->
      <operator activated="true" class="bagging" compatibility="5.2.008" expanded="true" height="76" name="Bagging" width="90" x="293" y="173">
        <parameter key="iterations" value="3"/>
        <process expanded="true" height="450" width="435">
          <!-- Fetch the stored Rock examples; remove_from_store=false keeps them in the store after recall. -->
          <operator activated="true" class="recall" compatibility="5.2.008" expanded="true" height="60" name="Recall" width="90" x="45" y="30">
            <parameter key="name" value="R"/>
            <parameter key="io_object" value="ExampleSet"/>
            <parameter key="remove_from_store" value="false"/>
          </operator>
          <!-- Merge the recalled Rock examples with the training set provided by Bagging, then train. -->
          <operator activated="true" class="append" compatibility="5.2.008" expanded="true" height="94" name="Append" width="90" x="180" y="30"/>
          <operator activated="true" class="naive_bayes" compatibility="5.2.008" expanded="true" height="76" name="Naive Bayes" width="90" x="315" y="30"/>
          <connect from_port="training set" to_op="Append" to_port="example set 2"/>
          <connect from_op="Recall" from_port="result" to_op="Append" to_port="example set 1"/>
          <connect from_op="Append" from_port="merged set" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring: "Mine" reads the unfiltered data from the "original" port of "Rock". -->
      <connect from_op="Retrieve" from_port="output" to_op="Rock" to_port="example set input"/>
      <connect from_op="Rock" from_port="example set output" to_op="Remember" to_port="store"/>
      <connect from_op="Rock" from_port="original" to_op="Mine" to_port="example set input"/>
      <connect from_op="Mine" from_port="example set output" to_op="Bagging" to_port="training set"/>
      <!-- The bagged model is delivered as the first process result. -->
      <connect from_op="Bagging" from_port="model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="72"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>