The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
How to use Principal Component Analysis on test data
mamdouhalshamy
Member Posts: 1 Learner III
I am a little bit new to RapidMiner.
I used PCA on the training data, and it reduced the data from 86 to 21 features.
I want to generate these 21 features from the test data so that I can apply the learned model to them.
Could anyone guide me on how to accomplish this?
I used PCA on the training data, and it reduced the data from 86 to 21 features.
I want to generate these 21 features from the test data so that I can apply the learned model to them.
Could anyone guide me on how to accomplish this?
0
Answers
The PCA operator outputs a so-called preprocessing model. You can group it together with the actual classification model via the Group Models operator (the order of the inputs is important: the preprocessing model comes first). If you apply the grouped model, the test data will first be transformed with the same PCA as the training data, and then the classification model will be applied.
If you do this inside an X-Validation, you have to add Materialize Data operators because of some internal peculiarities in how PCA works together with X-Validation.
For an example please have a look at the process below.
Best regards,
Marius
I am working on a dataset with a large number of attributes (590) that also has a class imbalance problem. It is a binomial classification problem. I am trying to run PCA first and then use the reduced dataset for cross-validation with a decision tree. I then grouped these two models and applied them to the test set. However, RapidMiner gives an error at the Apply Model operator saying: "The setup does not seem to contain any obvious errors, but you should check the log messages or activate the debug mode in the settings dialog in order to get more information about this problem." When I checked the log, the last line was:
SEVERE: java.lang.NullPointerException. Please help me with this. Here is my XML code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3 process: fit PCA on the training data, cross-validate a
  decision tree on the PCA output, then group the PCA preprocessing model with
  the tree model and apply the grouped model to a separate test set.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">

      <!-- Test branch: load the test set and materialize it before it is scored. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve semiconductortesting" width="90" x="45" y="255">
        <parameter key="repository_entry" value="../data/semiconductor/semiconductortesting"/>
      </operator>
      <operator activated="true" class="materialize_data" compatibility="5.3.015" expanded="true" height="76" name="Materialize Data (2)" width="90" x="246" y="255"/>

      <!-- Training branch: two repository entries are appended into one example set. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve kmedoid_undersampled_data" width="90" x="45" y="30">
        <parameter key="repository_entry" value="../data/semiconductor/kmedoid_undersampled_data"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve SMOTE_oversampled_data" width="90" x="45" y="120">
        <parameter key="repository_entry" value="../data/semiconductor/SMOTE_oversampled_data"/>
      </operator>
      <operator activated="true" class="append" compatibility="5.3.015" expanded="true" height="94" name="Append" width="90" x="179" y="30"/>
      <operator activated="true" class="materialize_data" compatibility="5.3.015" expanded="true" height="76" name="Materialize Data" width="90" x="313" y="30"/>

      <!-- Deactivated and not referenced by any connect element below. -->
      <operator activated="false" class="shuffle" compatibility="5.3.015" expanded="true" height="76" name="Shuffle" width="90" x="983" y="210">
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="532"/>
      </operator>

      <!-- PCA on the appended training data; its preprocessing model is reused for the test set. -->
      <operator activated="true" class="principal_component_analysis" compatibility="5.3.015" expanded="true" height="94" name="PCA" width="90" x="447" y="30">
        <parameter key="number_of_components" value="250"/>
      </operator>

      <!-- Two-fold X-Validation with shuffled sampling on the PCA output. -->
      <operator activated="true" class="x_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
        <parameter key="number_of_validations" value="2"/>
        <parameter key="sampling_type" value="shuffled sampling"/>

        <!-- Training subprocess: learn a decision tree on the training fold. -->
        <process expanded="true">
          <operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree" width="90" x="133" y="30"/>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>

        <!-- Testing subprocess: apply the tree to the test fold and measure performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="ApplyDEcisiontree" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance (3)" width="90" x="179" y="30"/>

          <!-- The next three performance operators are deactivated and not connected. -->
          <operator activated="false" class="performance_binominal_classification" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance (2)" width="90" x="112" y="165">
            <parameter key="AUC" value="true"/>
            <parameter key="f_measure" value="true"/>
          </operator>
          <operator activated="false" class="performance_costs" compatibility="5.3.015" expanded="true" height="76" name="trainingcostperformance" width="90" x="246" y="165">
            <parameter key="cost_matrix" value="[0.0 4.0;1.0 0.0]"/>
            <enumeration key="class_order_definition">
              <parameter key="class_name" value="-1.0"/>
              <parameter key="class_name" value="1.0"/>
            </enumeration>
          </operator>
          <operator activated="false" breakpoints="after" class="performance_classification" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance" width="90" x="380" y="255">
            <parameter key="main_criterion" value="accuracy"/>
            <parameter key="classification_error" value="true"/>
            <parameter key="weighted_mean_recall" value="true"/>
            <parameter key="weighted_mean_precision" value="true"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <list key="class_weights"/>
          </operator>

          <connect from_port="model" to_op="ApplyDEcisiontree" to_port="model"/>
          <connect from_port="test set" to_op="ApplyDEcisiontree" to_port="unlabelled data"/>
          <connect from_op="ApplyDEcisiontree" from_port="labelled data" to_op="trainingperformance (3)" to_port="labelled data"/>
          <connect from_op="trainingperformance (3)" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>

      <!-- Grouped model: PCA preprocessing model on input 1, validation model on input 2. -->
      <operator activated="true" class="group_models" compatibility="5.3.015" expanded="true" height="94" name="Group Models" width="90" x="313" y="165"/>

      <!-- Apply the grouped model to the materialized test set and evaluate the result. -->
      <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="applyontestdata" width="90" x="447" y="255">
        <list key="application_parameters"/>
      </operator>
      <operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="PerformanceTestdata" width="90" x="581" y="210"/>

      <!-- Wiring: test branch. -->
      <connect from_op="Retrieve semiconductortesting" from_port="output" to_op="Materialize Data (2)" to_port="example set input"/>
      <connect from_op="Materialize Data (2)" from_port="example set output" to_op="applyontestdata" to_port="unlabelled data"/>

      <!-- Wiring: training branch through PCA and validation. -->
      <connect from_op="Retrieve kmedoid_undersampled_data" from_port="output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Retrieve SMOTE_oversampled_data" from_port="output" to_op="Append" to_port="example set 2"/>
      <connect from_op="Append" from_port="merged set" to_op="Materialize Data" to_port="example set input"/>
      <connect from_op="Materialize Data" from_port="example set output" to_op="PCA" to_port="example set input"/>
      <connect from_op="PCA" from_port="example set output" to_op="Validation" to_port="training"/>

      <!-- Wiring: model grouping, scoring, and process results. -->
      <connect from_op="PCA" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
      <connect from_op="Validation" from_port="model" to_op="Group Models" to_port="models in 2"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
      <connect from_op="Group Models" from_port="model out" to_op="applyontestdata" to_port="model"/>
      <connect from_op="applyontestdata" from_port="labelled data" to_op="PerformanceTestdata" to_port="labelled data"/>
      <connect from_op="PerformanceTestdata" from_port="performance" to_port="result 2"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
That was a bug which has since been fixed. Your process should run just fine in RapidMiner Studio 6.4.
Regards,
Marco