The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
JMySVMLearner and Platt Scaling versus SVM with Split Validation
Hi,
Using one of RM's templates (JMySVMLearner and Platt Scaling) with my sample data, I get 8204/8646 correct predictions. The XML is:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3.015 process "Root": SVM followed by a confidence rescaling
  step (Platt scaling, per the description element below).
  Data flow, as wired by the connect elements:
    Retrieve, Nominal to Numerical, Replace Missing Values, JMySVMLearner,
    PlattScaling, ModelApplier, result 1.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Root">
    <description>Applies a platt scaling as postprocessing to an SVM model.</description>
    <process expanded="true">

      <!-- Load the example set stored in the local repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve 060114_WOU" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Local Repository/060114_WOU"/>
      </operator>

      <!-- Preprocessing: both operators run with empty parameter lists. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="5.3.015" expanded="true" height="94" name="Nominal to Numerical" width="90" x="112" y="30">
        <list key="comparison_groups"/>
      </operator>
      <operator activated="true" class="replace_missing_values" compatibility="5.3.015" expanded="true" height="94" name="Replace Missing Values" width="90" x="145" y="30">
        <list key="columns"/>
      </operator>

      <!-- SVM learner: radial kernel, convergence_epsilon set explicitly. -->
      <operator activated="true" class="support_vector_machine" compatibility="5.3.015" expanded="true" height="112" name="JMySVMLearner" width="90" x="179" y="30">
        <parameter key="kernel_type" value="radial"/>
        <parameter key="convergence_epsilon" value="0.0010"/>
      </operator>

      <!-- Confidence rescaling of the SVM model; a breakpoint is set after this operator. -->
      <operator activated="true" breakpoints="after" class="rescale_confidences" compatibility="5.3.015" expanded="true" height="76" name="PlattScaling" width="90" x="313" y="30"/>

      <!--
        Applies the rescaled model. The example set it receives is the one that
        was passed through JMySVMLearner and PlattScaling, i.e. the same data the
        model was trained on; no separate test set appears in this process.
      -->
      <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="ModelApplier" width="90" x="447" y="30">
        <list key="application_parameters"/>
      </operator>

      <!-- Wiring: preprocessing chain into the learner. -->
      <connect from_op="Retrieve 060114_WOU" from_port="output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="JMySVMLearner" to_port="training set"/>

      <!-- Wiring: model and training example set into PlattScaling, then into ModelApplier. -->
      <connect from_op="JMySVMLearner" from_port="model" to_op="PlattScaling" to_port="prediction model"/>
      <connect from_op="JMySVMLearner" from_port="exampleSet" to_op="PlattScaling" to_port="example set"/>
      <connect from_op="PlattScaling" from_port="example set" to_op="ModelApplier" to_port="unlabelled data"/>
      <connect from_op="PlattScaling" from_port="model" to_op="ModelApplier" to_port="model"/>

      <!-- The labelled example set is the only process result. -->
      <connect from_op="ModelApplier" from_port="labelled data" to_port="result 1"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="126"/>
    </process>
  </operator>
</process>
However, when I create a new process with the same sample data, split validation (70% training data with stratified sampling) and an SVM with the same settings as what appears for the JMySVMLearner in terms of kernel settings and other values, my accuracy is only 57.21%. The XML is:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3.015 process "Process": split validation around an SVM.
  The Validation operator holds two subprocesses: the first trains the model,
  the second applies it to the test set and measures performance.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">

      <!-- Load the example set stored in the local repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve 060114_WOU" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Local Repository/060114_WOU"/>
      </operator>

      <!--
        Split validation with stratified sampling. No split ratio is set here,
        so the operator's default ratio applies.
      -->
      <operator activated="true" class="split_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="179" y="30">
        <parameter key="sampling_type" value="stratified sampling"/>

        <!-- Training subprocess: preprocessing followed by the SVM learner. -->
        <process expanded="true">
          <operator activated="true" class="nominal_to_numerical" compatibility="5.3.015" expanded="true" height="94" name="Nominal to Numerical" width="90" x="-22" y="30">
            <list key="comparison_groups"/>
          </operator>
          <operator activated="true" class="replace_missing_values" compatibility="5.3.015" expanded="true" height="94" name="Replace Missing Values" width="90" x="11" y="30">
            <list key="columns"/>
          </operator>
          <!-- Radial kernel; convergence_epsilon is not set explicitly, so the operator default applies. -->
          <operator activated="true" class="support_vector_machine" compatibility="5.3.015" expanded="true" height="112" name="SVM" width="90" x="45" y="30">
            <parameter key="kernel_type" value="radial"/>
          </operator>
          <connect from_port="training" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="SVM" to_port="training set"/>
          <!--
            NOTE(review): only the SVM model leaves this subprocess. The two
            preprocessing operators above are not wired to the testing
            subprocess, so the test set reaches Apply Model without them.
            Confirm whether that is intended.
          -->
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>

        <!-- Testing subprocess: apply the model to the test set and compute performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>

      <!-- Top-level wiring: data into Validation; model, training data and performance out as results 1 to 3. -->
      <connect from_op="Retrieve 060114_WOU" from_port="output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="training" to_port="result 2"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
I am confused as to why there are such differences in accuracy.
Also, can I apply the Platt scaling to the split validation process somehow since I assume that this is contributing to the difference in accuracy of prediction?
I am new to the idea of Platt scaling and don't fully understand what this template does since there is no training/testing process split that I can see.
Thanks!
James
Using one of RM's templates (JMySVMLearner and Platt Scaling) with my sample data, I get 8204/8646 correct predictions. The XML is:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3.015 process "Root": SVM followed by a confidence rescaling
  step (Platt scaling, per the description element below).
  Data flow, as wired by the connect elements:
    Retrieve, Nominal to Numerical, Replace Missing Values, JMySVMLearner,
    PlattScaling, ModelApplier, result 1.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Root">
    <description>Applies a platt scaling as postprocessing to an SVM model.</description>
    <process expanded="true">

      <!-- Load the example set stored in the local repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve 060114_WOU" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Local Repository/060114_WOU"/>
      </operator>

      <!-- Preprocessing: both operators run with empty parameter lists. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="5.3.015" expanded="true" height="94" name="Nominal to Numerical" width="90" x="112" y="30">
        <list key="comparison_groups"/>
      </operator>
      <operator activated="true" class="replace_missing_values" compatibility="5.3.015" expanded="true" height="94" name="Replace Missing Values" width="90" x="145" y="30">
        <list key="columns"/>
      </operator>

      <!-- SVM learner: radial kernel, convergence_epsilon set explicitly. -->
      <operator activated="true" class="support_vector_machine" compatibility="5.3.015" expanded="true" height="112" name="JMySVMLearner" width="90" x="179" y="30">
        <parameter key="kernel_type" value="radial"/>
        <parameter key="convergence_epsilon" value="0.0010"/>
      </operator>

      <!-- Confidence rescaling of the SVM model; a breakpoint is set after this operator. -->
      <operator activated="true" breakpoints="after" class="rescale_confidences" compatibility="5.3.015" expanded="true" height="76" name="PlattScaling" width="90" x="313" y="30"/>

      <!--
        Applies the rescaled model. The example set it receives is the one that
        was passed through JMySVMLearner and PlattScaling, i.e. the same data the
        model was trained on; no separate test set appears in this process.
      -->
      <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="ModelApplier" width="90" x="447" y="30">
        <list key="application_parameters"/>
      </operator>

      <!-- Wiring: preprocessing chain into the learner. -->
      <connect from_op="Retrieve 060114_WOU" from_port="output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="JMySVMLearner" to_port="training set"/>

      <!-- Wiring: model and training example set into PlattScaling, then into ModelApplier. -->
      <connect from_op="JMySVMLearner" from_port="model" to_op="PlattScaling" to_port="prediction model"/>
      <connect from_op="JMySVMLearner" from_port="exampleSet" to_op="PlattScaling" to_port="example set"/>
      <connect from_op="PlattScaling" from_port="example set" to_op="ModelApplier" to_port="unlabelled data"/>
      <connect from_op="PlattScaling" from_port="model" to_op="ModelApplier" to_port="model"/>

      <!-- The labelled example set is the only process result. -->
      <connect from_op="ModelApplier" from_port="labelled data" to_port="result 1"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="126"/>
    </process>
  </operator>
</process>
However, when I create a new process with the same sample data, split validation (70% training data with stratified sampling) and an SVM with the same settings as what appears for the JMySVMLearner in terms of kernel settings and other values, my accuracy is only 57.21%. The XML is:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3.015 process "Process": split validation around an SVM.
  The Validation operator holds two subprocesses: the first trains the model,
  the second applies it to the test set and measures performance.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">

      <!-- Load the example set stored in the local repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve 060114_WOU" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Local Repository/060114_WOU"/>
      </operator>

      <!--
        Split validation with stratified sampling. No split ratio is set here,
        so the operator's default ratio applies.
      -->
      <operator activated="true" class="split_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="179" y="30">
        <parameter key="sampling_type" value="stratified sampling"/>

        <!-- Training subprocess: preprocessing followed by the SVM learner. -->
        <process expanded="true">
          <operator activated="true" class="nominal_to_numerical" compatibility="5.3.015" expanded="true" height="94" name="Nominal to Numerical" width="90" x="-22" y="30">
            <list key="comparison_groups"/>
          </operator>
          <operator activated="true" class="replace_missing_values" compatibility="5.3.015" expanded="true" height="94" name="Replace Missing Values" width="90" x="11" y="30">
            <list key="columns"/>
          </operator>
          <!-- Radial kernel; convergence_epsilon is not set explicitly, so the operator default applies. -->
          <operator activated="true" class="support_vector_machine" compatibility="5.3.015" expanded="true" height="112" name="SVM" width="90" x="45" y="30">
            <parameter key="kernel_type" value="radial"/>
          </operator>
          <connect from_port="training" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="SVM" to_port="training set"/>
          <!--
            NOTE(review): only the SVM model leaves this subprocess. The two
            preprocessing operators above are not wired to the testing
            subprocess, so the test set reaches Apply Model without them.
            Confirm whether that is intended.
          -->
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>

        <!-- Testing subprocess: apply the model to the test set and compute performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>

      <!-- Top-level wiring: data into Validation; model, training data and performance out as results 1 to 3. -->
      <connect from_op="Retrieve 060114_WOU" from_port="output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="training" to_port="result 2"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
I am confused as to why there are such differences in accuracy.
Also, can I apply the Platt scaling to the split validation process somehow since I assume that this is contributing to the difference in accuracy of prediction?
I am new to the idea of Platt scaling and don't fully understand what this template does since there is no training/testing process split that I can see.
Thanks!
James
0
Answers
To add Platt scaling (called Rescale Confidences in the current version of RapidMiner), just add it to the training process, as in the process below. However, the difference in accuracy comes from the fact that you apply the model to the training data in the first process, which usually dramatically overestimates your model's performance.
Best regards,
Marius