The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
"[SOLVED]Using PCA with Test Set Getting Error"
Hello,
I am working on a dataset with a large number of attributes (590), and this dataset also has a class imbalance problem. It is a binomial classification problem. I am trying to do PCA first and then use the reduced dataset for cross-validation with a decision tree. Then I grouped these two models and applied them to the test set. But RapidMiner gives an error at the Apply Model operator, saying: "The setup does not seem to contain any obvious errors, but you should check the log messages or activate the debug mode in the settings dialog in order to get more information about this problem." When I checked the log, the last line was:
SEVERE: java.lang.NullPointerException. Please help me with that :). Here is my XML code:
I am working on a dataset with a large number of attributes (590), and this dataset also has a class imbalance problem. It is a binomial classification problem. I am trying to do PCA first and then use the reduced dataset for cross-validation with a decision tree. Then I grouped these two models and applied them to the test set. But RapidMiner gives an error at the Apply Model operator, saying: "The setup does not seem to contain any obvious errors, but you should check the log messages or activate the debug mode in the settings dialog in order to get more information about this problem." When I checked the log, the last line was:
SEVERE: java.lang.NullPointerException. Please help me with that :). Here is my XML code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process definition (process version 5.3.015) as posted by the question author. -->
<!-- Overview: two retrieved example sets are appended, reduced with PCA and cross-validated with a decision tree; -->
<!-- the PCA preprocessing model and the validation model are grouped and applied to a separately retrieved data set. -->
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<!-- Branch 1: retrieve the "semiconductortesting" repository entry and materialize it. -->
<!-- Per the connections below, this data goes to the "unlabelled data" port of "applyontestdata". -->
<operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve semiconductortesting" width="90" x="45" y="255">
<parameter key="repository_entry" value="../data/semiconductor/semiconductortesting"/>
</operator>
<operator activated="true" class="materialize_data" compatibility="5.3.015" expanded="true" height="76" name="Materialize Data (2)" width="90" x="246" y="255"/>
<!-- Branch 2: retrieve two repository entries, merge them with Append, then materialize the merged set. -->
<operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve kmedoid_undersampled_data" width="90" x="45" y="30">
<parameter key="repository_entry" value="../data/semiconductor/kmedoid_undersampled_data"/>
</operator>
<operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve SMOTE_oversampled_data" width="90" x="45" y="120">
<parameter key="repository_entry" value="../data/semiconductor/SMOTE_oversampled_data"/>
</operator>
<operator activated="true" class="append" compatibility="5.3.015" expanded="true" height="94" name="Append" width="90" x="179" y="30"/>
<operator activated="true" class="materialize_data" compatibility="5.3.015" expanded="true" height="76" name="Materialize Data" width="90" x="313" y="30"/>
<!-- Shuffle is deactivated (activated="false") and has no connection entries in this process. -->
<operator activated="false" class="shuffle" compatibility="5.3.015" expanded="true" height="76" name="Shuffle" width="90" x="983" y="210">
<parameter key="use_local_random_seed" value="true"/>
<parameter key="local_random_seed" value="532"/>
</operator>
<!-- PCA with number_of_components set to 250; both its example set output and its preprocessing model are used below. -->
<operator activated="true" class="principal_component_analysis" compatibility="5.3.015" expanded="true" height="94" name="PCA" width="90" x="447" y="30">
<parameter key="number_of_components" value="250"/>
</operator>
<!-- Cross-validation (x_validation) with 2 validations and shuffled sampling; it contains two nested subprocesses. -->
<operator activated="true" class="x_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
<parameter key="number_of_validations" value="2"/>
<parameter key="sampling_type" value="shuffled sampling"/>
<!-- First subprocess: the "training" port feeds a Decision Tree, whose model goes to the "model" port. -->
<process expanded="true">
<operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree" width="90" x="133" y="30"/>
<connect from_port="training" to_op="Decision Tree" to_port="training set"/>
<connect from_op="Decision Tree" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Second subprocess: the model is applied to the "test set" port and the result is scored. -->
<!-- Only "trainingperformance (3)" is active and connected; the other three performance operators are deactivated and unconnected. -->
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="ApplyDEcisiontree" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance (3)" width="90" x="179" y="30"/>
<operator activated="false" class="performance_binominal_classification" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance (2)" width="90" x="112" y="165">
<parameter key="AUC" value="true"/>
<parameter key="f_measure" value="true"/>
</operator>
<operator activated="false" class="performance_costs" compatibility="5.3.015" expanded="true" height="76" name="trainingcostperformance" width="90" x="246" y="165">
<parameter key="cost_matrix" value="[0.0 4.0;1.0 0.0]"/>
<enumeration key="class_order_definition">
<parameter key="class_name" value="-1.0"/>
<parameter key="class_name" value="1.0"/>
</enumeration>
</operator>
<operator activated="false" breakpoints="after" class="performance_classification" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance" width="90" x="380" y="255">
<parameter key="main_criterion" value="accuracy"/>
<parameter key="classification_error" value="true"/>
<parameter key="weighted_mean_recall" value="true"/>
<parameter key="weighted_mean_precision" value="true"/>
<parameter key="root_mean_squared_error" value="true"/>
<list key="class_weights"/>
</operator>
<connect from_port="model" to_op="ApplyDEcisiontree" to_port="model"/>
<connect from_port="test set" to_op="ApplyDEcisiontree" to_port="unlabelled data"/>
<connect from_op="ApplyDEcisiontree" from_port="labelled data" to_op="trainingperformance (3)" to_port="labelled data"/>
<connect from_op="trainingperformance (3)" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Group Models receives the PCA preprocessing model on "models in 1" and the Validation model on "models in 2". -->
<operator activated="true" class="group_models" compatibility="5.3.015" expanded="true" height="94" name="Group Models" width="90" x="313" y="165"/>
<!-- Applies the grouped model to the data from "Materialize Data (2)". -->
<!-- NOTE(review): the post reports the error "at the applymodel"; presumably this operator rather than the nested one. Confirm with the author. -->
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="applyontestdata" width="90" x="447" y="255">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="PerformanceTestdata" width="90" x="581" y="210"/>
<!-- Top-level wiring between the operators declared above; results go to "result 1" (validation performance) and "result 2" (PerformanceTestdata). -->
<connect from_op="Retrieve semiconductortesting" from_port="output" to_op="Materialize Data (2)" to_port="example set input"/>
<connect from_op="Materialize Data (2)" from_port="example set output" to_op="applyontestdata" to_port="unlabelled data"/>
<connect from_op="Retrieve kmedoid_undersampled_data" from_port="output" to_op="Append" to_port="example set 1"/>
<connect from_op="Retrieve SMOTE_oversampled_data" from_port="output" to_op="Append" to_port="example set 2"/>
<connect from_op="Append" from_port="merged set" to_op="Materialize Data" to_port="example set input"/>
<connect from_op="Materialize Data" from_port="example set output" to_op="PCA" to_port="example set input"/>
<connect from_op="PCA" from_port="example set output" to_op="Validation" to_port="training"/>
<connect from_op="PCA" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
<connect from_op="Validation" from_port="model" to_op="Group Models" to_port="models in 2"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
<connect from_op="Group Models" from_port="model out" to_op="applyontestdata" to_port="model"/>
<connect from_op="applyontestdata" from_port="labelled data" to_op="PerformanceTestdata" to_port="labelled data"/>
<connect from_op="PerformanceTestdata" from_port="performance" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
0
Answers
What are the differences in attributes between your training & test datasets?
When I generated some sample data to test your process, it worked without an issue for me, but I know that if the features aren't the same for the training & test data, it might cause problems when trying to apply the models.
To test this, try connecting your original (training) data to the Apply Model operator to see if it still causes errors. If it doesn't, then you know you need to match the features on both sides. You could use an operator like 'Superset' to do this, but this will only add the attributes; it's better to delve in and have a look at the two datasets.
Likely it's something really simple like a field stored as polynominal in the training data is numerical in the test data.
All the XX to XX operators (like Nominal to Numerical) have an attribute selector. There you can specify which attributes should be converted. This includes regular expressions (e.g. all attributes which start with count_) or block types like "all numerical".
Cheers,
Martin
Dortmund, Germany