Issue with replicating (multiply operator) test data
I am analyzing the churn dataset (WA_Fn-UseC_-Telco-Customer-Churn.csv) from the IBM sample datasets website. Attached are the XML process and the error encountered when I try to replicate the test-data output port of the Cross Validation (CV) operator. I am using three replications of the test data: one for creating a lift chart, another for creating ROCs, and a third simply as output (to see the predictions and the confidence values). However, the predictions and confidence values are not coming through in that output. Testing with breakpoints shows that the test data before and after the Multiply operator have those values properly generated, but the final output does not display them. Any ideas would be of much help.
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
<process expanded="true">
<!-- Retrieve: loads the Telco customer churn data from the repository entry below (a relative repository path). -->
<operator activated="true" class="retrieve" compatibility="9.0.002" expanded="true" height="68" name="Retrieve WA_Fn-UseC_-Telco-Customer-Churn" width="90" x="112" y="85">
<parameter key="repository_entry" value="../data/WA_Fn-UseC_-Telco-Customer-Churn"/>
</operator>
<!-- Nominal to Binominal: applied to a subset of attributes (filter type "subset"), including special attributes. -->
<!-- NOTE(review): with filter type "subset" the "attributes" list is presumably the one that applies; the single "attribute" value (Churn) looks like a leftover. Confirm. -->
<operator activated="true" class="nominal_to_binominal" compatibility="9.0.002" expanded="true" height="103" name="Nominal to Binominal" width="90" x="45" y="187">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attribute" value="Churn"/>
<parameter key="attributes" value="Dependents|PaperlessBilling|Partner|PhoneService|gender|Churn"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<!-- Numerical to Binominal: applied to the single attribute SeniorCitizen (filter type "single"), including special attributes. -->
<operator activated="true" class="numerical_to_binominal" compatibility="9.0.002" expanded="true" height="82" name="Numerical to Binominal" width="90" x="45" y="340">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="SeniorCitizen"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<!-- Set Role: marks Churn as the label and, through the additional roles list, customerID as the id. -->
<operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" width="90" x="45" y="442">
<parameter key="attribute_name" value="Churn"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles">
<parameter key="customerID" value="id"/>
</list>
</operator>
<!-- Multiply: receives the Set Role output. In this process only its "output 1" port is connected (to Cross Validation). -->
<operator activated="true" class="multiply" compatibility="9.0.002" expanded="true" height="82" name="Multiply" width="90" x="45" y="544"/>
<!-- Cross Validation: uses a fixed local random seed (2018). It contains two subprocesses, the first fed by the "training set" port and the second by the "model" and "test set" ports. -->
<operator activated="true" class="concurrency:cross_validation" compatibility="9.0.002" expanded="true" height="145" name="Cross Validation" width="90" x="179" y="544">
<parameter key="use_local_random_seed" value="true"/>
<parameter key="local_random_seed" value="2018"/>
<!-- First subprocess: trains a Decision Tree (maximal depth 5, confidence 0.25, minimal leaf size 10) on the training set. -->
<process expanded="true">
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="9.0.002" expanded="true" height="103" name="Decision Tree" width="90" x="179" y="34">
<parameter key="maximal_depth" value="5"/>
<parameter key="confidence" value="0.25"/>
<parameter key="minimal_leaf_size" value="10"/>
</operator>
<connect from_port="training set" to_op="Decision Tree" to_port="training set"/>
<connect from_op="Decision Tree" from_port="model" to_port="model"/>
<!-- The tree's weights are handed over on "through 1"; no connection in the second subprocess reads "through 1". -->
<connect from_op="Decision Tree" from_port="weights" to_port="through 1"/>
<portSpacing port="source_training set" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
<portSpacing port="sink_through 2" spacing="0"/>
</process>
<!-- Second subprocess: applies the model to the test set, then computes classification performance. -->
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="9.0.002" expanded="true" height="82" name="Apply Model" width="90" x="112" y="187">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_classification" compatibility="9.0.002" expanded="true" height="82" name="Performance" width="90" x="313" y="187">
<list key="class_weights"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="performance 1"/>
<!-- The example set leaving Performance is delivered as the "test set results" of the validation. -->
<connect from_op="Performance" from_port="example set" to_port="test set results"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="source_through 2" spacing="0"/>
<portSpacing port="sink_test set results" spacing="0"/>
<portSpacing port="sink_performance 1" spacing="0"/>
<portSpacing port="sink_performance 2" spacing="0"/>
</process>
</operator>
<!-- Model Replicate: Multiply operator for the Cross Validation model; "output 1" goes to result 2 and "output 2" to the Lift Chart. -->
<operator activated="true" class="multiply" compatibility="9.0.002" expanded="true" height="103" name="Model Replicate" width="90" x="246" y="136"/>
<!-- Test data replicate: Multiply operator for the Cross Validation "test result set"; its outputs go to result 3, the Lift Chart and Compare ROCs. -->
<!-- NOTE(review): the thread reports that predictions and confidences are missing at result 3; the suggested remedy is a Materialize Data operator before Compare ROCs (or before the final output). -->
<operator activated="true" class="multiply" compatibility="9.0.002" expanded="true" height="124" name="Test data replicate" width="90" x="313" y="289"/>
<!-- Lift Chart (Simple): builds a lift chart for target class "Yes" from the replicated model and test data. -->
<!-- NOTE(review): compatibility is 9.0.001 while the other operators use 9.0.002; presumably the version of the model_simulator extension. Confirm. -->
<operator activated="true" class="model_simulator:lift_chart" compatibility="9.0.001" expanded="true" height="82" name="Lift Chart (Simple)" width="90" x="447" y="187">
<parameter key="target class" value="Yes"/>
</operator>
<!-- Compare ROCs: the inner process trains three differently configured Decision Trees; each reads "train N" and delivers its model on "model N" (N = 1..3). -->
<operator activated="true" class="compare_rocs" compatibility="9.0.002" expanded="true" height="82" name="Compare ROCs" width="90" x="648" y="289">
<process expanded="true">
<!-- Tree (2): same settings as the tree inside Cross Validation (depth 5, confidence 0.25, minimal leaf size 10). -->
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="9.0.002" expanded="true" height="103" name="Decision Tree (2)" width="90" x="313" y="85">
<parameter key="maximal_depth" value="5"/>
<parameter key="confidence" value="0.25"/>
<parameter key="minimal_leaf_size" value="10"/>
</operator>
<!-- Tree (3): no explicit maximal depth; confidence 0.25, minimal leaf size 5. -->
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="9.0.002" expanded="true" height="103" name="Decision Tree (3)" width="90" x="313" y="238">
<parameter key="confidence" value="0.25"/>
<parameter key="minimal_leaf_size" value="5"/>
</operator>
<!-- Tree (4): maximal depth 15, minimal gain 0.02. -->
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="9.0.002" expanded="true" height="103" name="Decision Tree (4)" width="90" x="313" y="391">
<parameter key="maximal_depth" value="15"/>
<parameter key="minimal_gain" value="0.02"/>
</operator>
<connect from_port="train 1" to_op="Decision Tree (2)" to_port="training set"/>
<connect from_port="train 2" to_op="Decision Tree (3)" to_port="training set"/>
<connect from_port="train 3" to_op="Decision Tree (4)" to_port="training set"/>
<connect from_op="Decision Tree (2)" from_port="model" to_port="model 1"/>
<connect from_op="Decision Tree (3)" from_port="model" to_port="model 2"/>
<connect from_op="Decision Tree (4)" from_port="model" to_port="model 3"/>
<!-- "train 4" and "model 4" have spacing entries only; nothing is connected to them. -->
<portSpacing port="source_train 1" spacing="0"/>
<portSpacing port="source_train 2" spacing="0"/>
<portSpacing port="source_train 3" spacing="0"/>
<portSpacing port="source_train 4" spacing="0"/>
<portSpacing port="sink_model 1" spacing="0"/>
<portSpacing port="sink_model 2" spacing="0"/>
<portSpacing port="sink_model 3" spacing="0"/>
<portSpacing port="sink_model 4" spacing="0"/>
</process>
</operator>
<!-- Preprocessing chain: Retrieve, Nominal to Binominal, Numerical to Binominal, Set Role, Multiply, Cross Validation. -->
<connect from_op="Retrieve WA_Fn-UseC_-Telco-Customer-Churn" from_port="output" to_op="Nominal to Binominal" to_port="example set input"/>
<connect from_op="Nominal to Binominal" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Cross Validation" to_port="example set"/>
<!-- Cross Validation outputs: the model and the test result set are each fanned out by a Multiply operator; performance goes to result 1. -->
<connect from_op="Cross Validation" from_port="model" to_op="Model Replicate" to_port="input"/>
<connect from_op="Cross Validation" from_port="test result set" to_op="Test data replicate" to_port="input"/>
<connect from_op="Cross Validation" from_port="performance 1" to_port="result 1"/>
<connect from_op="Model Replicate" from_port="output 1" to_port="result 2"/>
<connect from_op="Model Replicate" from_port="output 2" to_op="Lift Chart (Simple)" to_port="model"/>
<!-- result 3 is the plain test data output that the thread reports as lacking predictions and confidence values. -->
<connect from_op="Test data replicate" from_port="output 1" to_port="result 3"/>
<connect from_op="Test data replicate" from_port="output 2" to_op="Lift Chart (Simple)" to_port="test data"/>
<connect from_op="Test data replicate" from_port="output 3" to_op="Compare ROCs" to_port="example set"/>
<!-- Note the result numbering: the lift chart is delivered on result 5 and the ROC comparison on result 4. -->
<connect from_op="Lift Chart (Simple)" from_port="lift chart" to_port="result 5"/>
<connect from_op="Compare ROCs" from_port="rocComparison" to_port="result 4"/>
<!-- Port spacing entries for the top-level process ports; presumably layout metadata only. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
</process>
</operator>
</process>
Comments
Hi,
the solution is to put a Materialize Data operator before the Compare ROCs operator.
It forces RapidMiner to create a clean copy of the example set, so the other versions aren't overwritten.
Normally you don't need to do this explicitly, but in rare cases like this one it is necessary.
Best,
David
Thanks, the general idea worked. I ended up putting the Materialize Data operator simply before the final output.
Here is the working solution for anyone else.
this is very nice, @amitdeokar. Can I put this on the Community Repository?
Scott
Certainly, happy to share.
awesome. Thanks very much @amitdeokar. It's now in the Community Repo:
TelCo Customer Churn Use Case
Scott