The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community will be in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as usual. For any urgent requests from students or faculty members, please submit the form linked here.
Specificity, Sensitivity and AUC
Hello,
I'm doing some research for my bachelor thesis and have some questions about Rapidminer 5 and AUC.
I'm trying to compare three classifiers (SVM, Linear Regression and Linear Discriminant Analysis) with 10-fold cross-validation and AUC as the main criterion.
For Linear Regression and Linear Discriminant Analysis I always get the same AUC (0.5). No matter how many variables I'm using or how I normalize the data before. On the other hand Specificity and Sensitivity values change. For example, the Linear Regression Model delivers a Specificity 78%, Sensitivity 71% and AUC 0.5. Linear Discriminant Analysis delivers a Specificity 70.81%, Sensitivity 64.5% and AUC 0.5
The SVM values (Specificity 81.5%, Sensitivity 79.5%, AUC 0.904) seem to be OK.
Can this be correct, or is this a data problem? Does a linear model always have an AUC of 0.5, no matter whether the Specificity and Sensitivity values are both high?
I couldn't find any literature about this strange behavior. I always thought that with a high Specificity (e.g. 85%) and a high Sensitivity (e.g. 87%) I would get a high AUC value.
The whole process with Training Data can be downloaded here http://www.myexperiment.org/packs/151/download
I'm doing some research for my bachelor thesis and have some questions about Rapidminer 5 and AUC.
I'm trying to compare three classifiers (SVM, Linear Regression and Linear Discriminant Analysis) with 10-fold cross-validation and AUC as the main criterion.
For Linear Regression and Linear Discriminant Analysis I always get the same AUC (0.5). No matter how many variables I'm using or how I normalize the data before. On the other hand Specificity and Sensitivity values change. For example, the Linear Regression Model delivers a Specificity 78%, Sensitivity 71% and AUC 0.5. Linear Discriminant Analysis delivers a Specificity 70.81%, Sensitivity 64.5% and AUC 0.5
The SVM values (Specificity 81.5%, Sensitivity 79.5%, AUC 0.904) seem to be OK.
Can this be correct, or is this a data problem? Does a linear model always have an AUC of 0.5, no matter whether the Specificity and Sensitivity values are both high?
I couldn't find any literature about this strange behavior. I always thought that with a high Specificity (e.g. 85%) and a high Sensitivity (e.g. 87%) I would get a high AUC value.
The whole process with Training Data can be downloaded here http://www.myexperiment.org/packs/151/download
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5 process: reads one example set, fans it out with Multiply and
  evaluates three learners by cross-validation (x_validation), each with a
  binominal classification performance operator whose main criterion is AUC:

    output 1 of Multiply: LR X-val  (classification by linear regression), result 1
    output 2 of Multiply: Min-Max normalisation, then LDA X-val,            result 2
    output 3 of Multiply: Z-score normalisation, then SVM X-Val (libsvm),   result 3

  Both Normalize operators set create_view="true" so that the normalisation is
  applied as a view instead of rewriting the underlying data. Without it each
  normalisation changes the data that the other Multiply branches also read,
  so a branch that runs later (e.g. LR X-val) would see data that has already
  been normalised by the earlier branches.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.8" expanded="true" name="Root">
    <process expanded="true" height="562" width="1007">

      <!-- Data source. NOTE(review): absolute, machine-specific path; adjust before running elsewhere. -->
      <operator activated="true" class="read_aml" compatibility="5.0.10" expanded="true" height="60" name="TestSet" width="90" x="45" y="30">
        <parameter key="attributes" value="C:\Users\pawel\Documents\naruto1.aml"/>
      </operator>

      <!-- Fans the example set out to the three evaluation branches. -->
      <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="112" name="Multiply" width="90" x="45" y="255"/>

      <!-- Normalisation for the SVM branch. No method is set, so the operator default applies
           (the operator name suggests a Z-transformation; confirm against the operator docs). -->
      <operator activated="true" class="normalize" compatibility="5.0.10" expanded="true" height="94" name="Z-score" width="90" x="246" y="75">
        <!-- Apply as a view so the data shared with the other branches is left untouched. -->
        <parameter key="create_view" value="true"/>
      </operator>

      <!-- Branch 3: cross-validated SVM. -->
      <operator activated="true" class="x_validation" compatibility="5.0.10" expanded="true" height="112" name="SVM X-Val" width="90" x="447" y="30">
        <!-- Training subprocess: fit the libsvm learner on the training fold. -->
        <process expanded="true" height="385" width="330">
          <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.0.10" expanded="true" height="76" name="SVM" width="90" x="119" y="62">
            <list key="class_weights"/>
          </operator>
          <connect from_port="training" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: apply the model to the test fold and measure performance. -->
        <process expanded="true" height="385" width="330">
          <operator activated="true" class="apply_model" compatibility="5.0.10" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.10" expanded="true" height="76" name="SVM Perf" width="90" x="179" y="30">
            <parameter key="main_criterion" value="AUC"/>
            <parameter key="classification_error" value="true"/>
            <parameter key="AUC" value="true"/>
            <parameter key="sensitivity" value="true"/>
            <parameter key="specificity" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="SVM Perf" to_port="labelled data"/>
          <connect from_op="SVM Perf" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>

      <!-- Normalisation for the LDA branch: range transformation. -->
      <operator activated="true" class="normalize" compatibility="5.0.10" expanded="true" height="94" name="Min-Max" width="90" x="269" y="296">
        <parameter key="method" value="range transformation"/>
        <!-- Apply as a view so the data shared with the other branches is left untouched. -->
        <parameter key="create_view" value="true"/>
      </operator>

      <!-- Branch 2: cross-validated Linear Discriminant Analysis.
           NOTE(review): this is the only validation that overrides sampling_type (linear sampling);
           the SVM and LR validations leave it unset. Confirm this difference is intentional,
           since the three results are meant to be compared. -->
      <operator activated="true" class="x_validation" compatibility="5.0.8" expanded="true" height="112" name="LDA X-val" width="90" x="447" y="300">
        <parameter key="sampling_type" value="linear sampling"/>
        <!-- Training subprocess: fit LDA on the training fold. -->
        <process expanded="true" height="405" width="347">
          <operator activated="true" class="linear_discriminant_analysis" compatibility="5.0.8" expanded="true" height="76" name="LDA (2)" width="90" x="133" y="30"/>
          <connect from_port="training" to_op="LDA (2)" to_port="training set"/>
          <connect from_op="LDA (2)" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: apply the model to the test fold and measure performance. -->
        <process expanded="true" height="405" width="347">
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (7)" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="LDA Perf" width="90" x="200" y="30">
            <parameter key="main_criterion" value="AUC"/>
            <parameter key="classification_error" value="true"/>
            <parameter key="AUC" value="true"/>
            <parameter key="sensitivity" value="true"/>
            <parameter key="specificity" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (7)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (7)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (7)" from_port="labelled data" to_op="LDA Perf" to_port="labelled data"/>
          <connect from_op="LDA Perf" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>

      <!-- Branch 1: cross-validated linear regression wrapped in Classification by Regression.
           It is fed directly from Multiply, i.e. without a Normalize operator in front of it. -->
      <operator activated="true" class="x_validation" compatibility="5.0.8" expanded="true" height="112" name="LR X-val" width="90" x="447" y="165">
        <!-- Training subprocess: the regression learner is nested inside the classification wrapper. -->
        <process expanded="true" height="405" width="347">
          <operator activated="true" class="classification_by_regression" compatibility="5.0.8" expanded="true" height="76" name="Classification by Regression (2)" width="90" x="133" y="30">
            <process expanded="true" height="385" width="710">
              <operator activated="true" class="linear_regression" compatibility="5.0.8" expanded="true" height="94" name="Linear Regression (2)" width="90" x="319" y="30">
                <parameter key="feature_selection" value="none"/>
              </operator>
              <connect from_port="training set" to_op="Linear Regression (2)" to_port="training set"/>
              <connect from_op="Linear Regression (2)" from_port="model" to_port="model"/>
              <portSpacing port="source_training set" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
            </process>
          </operator>
          <connect from_port="training" to_op="Classification by Regression (2)" to_port="training set"/>
          <connect from_op="Classification by Regression (2)" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: apply the model to the test fold and measure performance. -->
        <process expanded="true" height="405" width="347">
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (5)" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="LR Perf" width="90" x="200" y="30">
            <parameter key="main_criterion" value="AUC"/>
            <parameter key="classification_error" value="true"/>
            <parameter key="AUC" value="true"/>
            <parameter key="sensitivity" value="true"/>
            <parameter key="specificity" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (5)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (5)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (5)" from_port="labelled data" to_op="LR Perf" to_port="labelled data"/>
          <connect from_op="LR Perf" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>

      <!-- Top-level wiring: source, fan-out, the three branches, and the result ports. -->
      <connect from_op="TestSet" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="LR X-val" to_port="training"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Min-Max" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Z-score" to_port="example set input"/>
      <connect from_op="Z-score" from_port="example set output" to_op="SVM X-Val" to_port="training"/>
      <connect from_op="SVM X-Val" from_port="averagable 1" to_port="result 3"/>
      <connect from_op="Min-Max" from_port="example set output" to_op="LDA X-val" to_port="training"/>
      <connect from_op="LDA X-val" from_port="averagable 1" to_port="result 2"/>
      <connect from_op="LR X-val" from_port="averagable 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
1
Answers
I couldn't help noticing that when you normalise you have not selected the 'create view' option, which means that each normalisation changes the underlying data. By the time the LR XVal starts the data has in fact been normalised twice! You can use the double-headed blue arrow in the Process view to see the operator sequence.
This is quite a common novice gotcha, and Sebastian (who actually knows what he is talking about) runs through the explanation here.
http://rapid-i.com/rapidforum/index.php/topic,2588.0.html
Hopefully that will clear some of the fog!
Am I correct? I tried to find literature that explains this answer, but I could not find any. Why?
Scott