The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
GC Overhead limit for Decision Tree
aryan_hosseinza
Member Posts: 74 Contributor II
Hi everybody,
I've got a dataset with 580,000 instances and about 15 nominal features (except for the label, which is binominal).
When I run the process below, it gives me an error saying that the GC overhead limit was exceeded (along with a long error message saying that it's not possible to clone sampleset, etc.).
Is this usual in RapidMiner, or is there something wrong?
Note: I have 6 gigabytes of memory available for RapidMiner, and I guess the problem is with bagging, as it stops running when it reaches the 6th decision tree, but I don't know how to fix it.
thanks
I've got a dataset with 580,000 instances and about 15 nominal features (except for the label, which is binominal).
When I run the process below, it gives me an error saying that the GC overhead limit was exceeded (along with a long error message saying that it's not possible to clone sampleset, etc.).
Is this usual in RapidMiner, or is there something wrong?
Note: I have 6 gigabytes of memory available for RapidMiner, and I guess the problem is with bagging, as it stops running when it reaches the 6th decision tree, but I don't know how to fix it.
thanks
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.2.008 process: an x_validation operator whose training side runs Bagging around a Decision Tree. -->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="611" width="1016">
      <!-- Connected data path (see the connect elements at the end of this subprocess):
           Retrieve, Set Role (2), Sample (2), Numerical to Polynominal, Validation (2). -->
      <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="210">
        <parameter key="repository_entry" value="descritized_GI_FROM_MI50"/>
      </operator>
      <!-- Assigns the role "label" to the attribute "event". -->
      <operator activated="true" class="set_role" compatibility="5.2.008" expanded="true" height="76" name="Set Role (2)" width="90" x="246" y="210">
        <parameter key="name" value="event"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Deactivated (activated="false") and not referenced by any connect element. -->
      <operator activated="false" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve (2)" width="90" x="45" y="480">
        <parameter key="repository_entry" value="MI_67"/>
      </operator>
      <!-- Deactivated (activated="false") and not referenced by any connect element. -->
      <operator activated="false" class="discretize_by_bins" compatibility="5.2.008" expanded="true" height="94" name="Discretize" width="90" x="246" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="age"/>
        <parameter key="number_of_bins" value="5"/>
        <parameter key="range_name_type" value="short"/>
      </operator>
      <!-- Stratified sampling in "relative" mode; no ratio parameter is stored in this file. -->
      <operator activated="true" class="sample_stratified" compatibility="5.2.008" expanded="true" height="76" name="Sample (2)" width="90" x="380" y="210">
        <parameter key="sample" value="relative"/>
      </operator>
      <operator activated="true" class="numerical_to_polynominal" compatibility="5.2.008" expanded="true" height="76" name="Numerical to Polynominal" width="90" x="514" y="210"/>
      <!-- Deactivated (activated="false") and not referenced by any connect element. -->
      <operator activated="false" class="select_attributes" compatibility="5.2.008" expanded="true" height="76" name="Select Attributes (3)" width="90" x="514" y="480">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="event|age|d_04173|day_30_readmits|i_9985|los|num_diags|num_drugs|num_procs|p_14|prev_readmissions||sex"/>
      </operator>
      <!-- Validation operator with two subprocesses: the first builds the model, the second applies and scores it. -->
      <operator activated="true" class="x_validation" compatibility="5.2.008" expanded="true" height="112" name="Validation (2)" width="90" x="782" y="210">
        <!-- First subprocess: "training" port feeds Bagging, whose model goes to the "model" sink. -->
        <process expanded="true" height="836" width="2399">
          <!-- Deactivated alternative learner; not connected. -->
          <operator activated="false" class="naive_bayes_kernel" compatibility="5.2.008" expanded="true" height="76" name="Naive Bayes (Kernel)" width="90" x="514" y="30">
            <parameter key="laplace_correction" value="false"/>
            <parameter key="estimation_mode" value="full"/>
            <parameter key="bandwidth_selection" value="fix"/>
            <parameter key="bandwidth" value="0.6"/>
          </operator>
          <!-- Bagging configured with 40 iterations and a sample ratio of 0.4. -->
          <operator activated="true" class="bagging" compatibility="5.2.008" expanded="true" height="76" name="Bagging" width="90" x="581" y="210">
            <parameter key="sample_ratio" value="0.4"/>
            <parameter key="iterations" value="40"/>
            <parameter key="average_confidences" value="false"/>
            <!-- Inner learner: takes the Bagging "training set" port and returns its model. -->
            <process expanded="true" height="677" width="1037">
              <operator activated="true" class="decision_tree" compatibility="5.2.008" expanded="true" height="76" name="Decision Tree (2)" width="90" x="459" y="201">
                <parameter key="criterion" value="gini_index"/>
                <parameter key="minimal_size_for_split" value="320"/>
                <parameter key="minimal_leaf_size" value="160"/>
                <parameter key="maximal_depth" value="10"/>
                <parameter key="confidence" value="0.1"/>
                <parameter key="number_of_prepruning_alternatives" value="10"/>
                <parameter key="no_pre_pruning" value="true"/>
              </operator>
              <connect from_port="training set" to_op="Decision Tree (2)" to_port="training set"/>
              <connect from_op="Decision Tree (2)" from_port="model" to_port="model"/>
              <portSpacing port="source_training set" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
            </process>
          </operator>
          <!-- Deactivated stand-alone Decision Tree; not connected. -->
          <operator activated="false" class="decision_tree" compatibility="5.2.008" expanded="true" height="76" name="Decision Tree" width="90" x="380" y="390"/>
          <connect from_port="training" to_op="Bagging" to_port="training set"/>
          <connect from_op="Bagging" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Second subprocess: Apply Model on the "test set" port, then Performance to "averagable 1". -->
        <process expanded="true" height="682" width="502">
          <operator activated="true" class="apply_model" compatibility="5.2.008" expanded="true" height="76" name="Apply Model" width="90" x="112" y="30">
            <list key="application_parameters"/>
          </operator>
          <!-- Binominal classification performance: accuracy switched off, AUC and f_measure switched on. -->
          <operator activated="true" class="performance_binominal_classification" compatibility="5.2.008" expanded="true" height="76" name="Performance" width="90" x="313" y="30">
            <parameter key="accuracy" value="false"/>
            <parameter key="AUC" value="true"/>
            <parameter key="f_measure" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring; the validation's performance goes to "result 1" and its model to "result 2". -->
      <connect from_op="Retrieve" from_port="output" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Sample (2)" to_port="example set input"/>
      <connect from_op="Sample (2)" from_port="example set output" to_op="Numerical to Polynominal" to_port="example set input"/>
      <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Validation (2)" to_port="training"/>
      <connect from_op="Validation (2)" from_port="model" to_port="result 2"/>
      <connect from_op="Validation (2)" from_port="averagable 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
0
Answers
you have 3 options:
1. increase the amount of memory of your machine
2. reduce the number of examples
3. reduce the number of iterations for the bagging operator
I would go for the last 2 options. 580,000 examples is quite a lot, so I would apply sampling beforehand. At the same time, 40 iterations for bagging is rather unusual; try reducing it to something around 10 and increasing it gradually - you probably won't see a notable improvement beyond a certain iteration count, because the learning curve reaches saturation.
I suggest using Loop Parameters in combination with the Log operator to investigate the impact of the iteration count on the performance.
Best,
Marius
What does the System Monitor View in RapidMiner show? If you can't find it, you have to activate it via View->Show View->System Monitor.
Best,
Marius