Possible bug in X-Means parameters.
Hello, World!
I was explaining something with clustering with k-Means, and my XML looked like this (Notice that I'm using the Operator Toolbox extension because I'm too lazy to open Excel for my examples):
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
<process expanded="true">
<!--
  Create ExampleSet (Operator Toolbox extension): builds the demo data inline from
  CSV text (generator_type=comma_separated_text) instead of reading a file.
  The first record of input_csv_text is the header (Color,Status); each following
  record is one Color,Status row.
  Records are separated with &#10; (an escaped line feed). A plain space or a
  literal line break inside an attribute value cannot act as a record separator,
  because XML attribute-value normalization turns line breaks into spaces.
  parse_all_as_nominal is true (presumably so both columns are read as nominal
  values; confirm against the operator documentation).
-->
<operator
    activated="true"
    class="operator_toolbox:create_exampleset"
    compatibility="1.2.000"
    expanded="true"
    height="68"
    name="Create ExampleSet"
    width="90"
    x="45"
    y="34">
  <parameter key="generator_type" value="comma_separated_text"/>
  <!-- The list parameters below are left empty in this process. -->
  <list key="function_descriptions"/>
  <list key="numeric_series_configuration"/>
  <list key="date_series_configuration"/>
  <list key="date_series_configuration (interval)"/>
  <parameter key="input_csv_text" value="Color,Status&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Green,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Green,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad"/>
  <parameter key="parse_all_as_nominal" value="true"/>
</operator>
<!-- Generate ID: sits between Create ExampleSet and Generate Attributes; no parameters are overridden. -->
<operator
    activated="true"
    class="generate_id"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Generate ID"
    width="90"
    x="179"
    y="34"/>
<!-- Generate Attributes: adds two filler attributes computed from rand(). -->
<operator
    activated="true"
    class="generate_attributes"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Generate Attributes"
    width="90"
    x="313"
    y="34">
  <list key="function_descriptions">
    <!-- random_stuff: rand() scaled by 128. -->
    <parameter key="random_stuff" value="rand() * 128"/>
    <!-- more_random_stuff: its expression references random_stuff, which is defined just above. -->
    <parameter key="more_random_stuff" value="rand() * 3.1415926535 / 17 * random_stuff"/>
  </list>
</operator>
<!-- Multiply: fans the prepared data out. Per the connect elements, output 1 feeds Select Attributes and output 2 feeds the right input of Join. -->
<operator
    activated="true"
    class="multiply"
    compatibility="8.2.000"
    expanded="true"
    height="103"
    name="Multiply"
    width="90"
    x="514"
    y="340"/>
<!--
  Select Attributes: keeps only the subset id, Status and Color before clustering.
  include_special_attributes is true (presumably so that id, if it carries a
  special role, is matched by the filter as well; confirm).
-->
<operator
    activated="true"
    class="select_attributes"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Select Attributes"
    width="90"
    x="648"
    y="34">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attributes" value="id|Status|Color"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!--
  X-Means: alternative clusterer, switched off in this variant (activated=false)
  and not referenced by any connect element.
  Configured with MixedMeasures and FastKMeans; add_as_label is not set here.
-->
<operator
    activated="false"
    class="x_means"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="X-Means"
    width="90"
    x="782"
    y="136">
  <parameter key="measure_types" value="MixedMeasures"/>
  <parameter key="clustering_algorithm" value="FastKMeans"/>
</operator>
<!--
  Set Role: switched off in this variant (activated=false) and not wired.
  When enabled it assigns the role label to the attribute named cluster.
-->
<operator
    activated="false"
    class="set_role"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Set Role"
    width="90"
    x="782"
    y="238">
  <parameter key="attribute_name" value="cluster"/>
  <parameter key="target_role" value="label"/>
  <!-- No additional role assignments. -->
  <list key="set_additional_roles"/>
</operator>
<!--
  Clustering (k-Means): the active clusterer in this variant.
  k=4, MixedMeasures, max_optimization_steps=4.
  add_as_label=true: per the accompanying post, the cluster attribute then comes
  out with the label role, so the Decision Tree can train on it directly.
-->
<operator
    activated="true"
    class="concurrency:k_means"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Clustering"
    width="90"
    x="782"
    y="34">
  <parameter key="add_as_label" value="true"/>
  <parameter key="k" value="4"/>
  <parameter key="measure_types" value="MixedMeasures"/>
  <parameter key="max_optimization_steps" value="4"/>
</operator>
<!--
  Decision Tree: trained on the clustered set to make the clusters interpretable.
  Criterion is information_gain; pruning and pre-pruning are both switched off.
-->
<operator
    activated="true"
    class="concurrency:parallel_decision_tree"
    compatibility="8.2.000"
    expanded="true"
    height="103"
    name="Decision Tree"
    width="90"
    x="1050"
    y="187">
  <parameter key="criterion" value="information_gain"/>
  <parameter key="apply_pruning" value="false"/>
  <parameter key="apply_prepruning" value="false"/>
</operator>
<!--
  Join: per the connect elements, left is the Decision Tree exampleSet output and
  right is Multiply output 2.
  use_id_attribute_as_key is false, so the key is listed explicitly: id = id.
-->
<operator
    activated="true"
    class="concurrency:join"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Join"
    width="90"
    x="1184"
    y="289">
  <parameter key="use_id_attribute_as_key" value="false"/>
  <list key="key_attributes">
    <parameter key="id" value="id"/>
  </list>
</operator>
<!-- Wiring of the k-Means variant. Data preparation chain: Create ExampleSet, Generate ID, Generate Attributes, Multiply. -->
<connect from_op="Create ExampleSet" from_port="output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Multiply" to_port="input"/>
<!-- Multiply splits the flow: output 1 goes to attribute selection and clustering, output 2 goes straight to the right side of Join. -->
<connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="right"/>
<!-- Clustering feeds the Decision Tree directly; the disabled X-Means and Set Role operators are not wired in this variant. -->
<connect from_op="Select Attributes" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="clustered set" to_op="Decision Tree" to_port="training set"/>
<!-- Process results: the tree model on result 2 and the joined example set on result 3; result 1 is not connected. -->
<connect from_op="Decision Tree" from_port="model" to_port="result 2"/>
<connect from_op="Decision Tree" from_port="exampleSet" to_op="Join" to_port="left"/>
<connect from_op="Join" from_port="join" to_port="result 3"/>
<!-- Spacing values for the process ports (presumably canvas layout only; confirm). -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="168"/>
<portSpacing port="sink_result 3" spacing="126"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
Pay attention to the Clustering operator: it has add cluster attribute checked and add as label as well, and it does add a cluster attribute with a label role.
Then I tried to switch k-Means by X-Means:
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
<process expanded="true">
<!--
  Create ExampleSet (Operator Toolbox extension): builds the demo data inline from
  CSV text (generator_type=comma_separated_text) instead of reading a file.
  The first record of input_csv_text is the header (Color,Status); each following
  record is one Color,Status row.
  Records are separated with &#10; (an escaped line feed). A plain space or a
  literal line break inside an attribute value cannot act as a record separator,
  because XML attribute-value normalization turns line breaks into spaces.
  parse_all_as_nominal is true (presumably so both columns are read as nominal
  values; confirm against the operator documentation).
-->
<operator
    activated="true"
    class="operator_toolbox:create_exampleset"
    compatibility="1.2.000"
    expanded="true"
    height="68"
    name="Create ExampleSet"
    width="90"
    x="45"
    y="34">
  <parameter key="generator_type" value="comma_separated_text"/>
  <!-- The list parameters below are left empty in this process. -->
  <list key="function_descriptions"/>
  <list key="numeric_series_configuration"/>
  <list key="date_series_configuration"/>
  <list key="date_series_configuration (interval)"/>
  <parameter key="input_csv_text" value="Color,Status&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Green,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Green,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad"/>
  <parameter key="parse_all_as_nominal" value="true"/>
</operator>
<!-- Generate ID: sits between Create ExampleSet and Generate Attributes; no parameters are overridden. -->
<operator
    activated="true"
    class="generate_id"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Generate ID"
    width="90"
    x="179"
    y="34"/>
<!-- Generate Attributes: adds two filler attributes computed from rand(). -->
<operator
    activated="true"
    class="generate_attributes"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Generate Attributes"
    width="90"
    x="313"
    y="34">
  <list key="function_descriptions">
    <!-- random_stuff: rand() scaled by 128. -->
    <parameter key="random_stuff" value="rand() * 128"/>
    <!-- more_random_stuff: its expression references random_stuff, which is defined just above. -->
    <parameter key="more_random_stuff" value="rand() * 3.1415926535 / 17 * random_stuff"/>
  </list>
</operator>
<!-- Multiply: fans the prepared data out. Per the connect elements, output 1 feeds Select Attributes and output 2 feeds the right input of Join. -->
<operator
    activated="true"
    class="multiply"
    compatibility="8.2.000"
    expanded="true"
    height="103"
    name="Multiply"
    width="90"
    x="514"
    y="340"/>
<!--
  Select Attributes: keeps only the subset id, Status and Color before clustering.
  include_special_attributes is true (presumably so that id, if it carries a
  special role, is matched by the filter as well; confirm).
-->
<operator
    activated="true"
    class="select_attributes"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Select Attributes"
    width="90"
    x="648"
    y="34">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attributes" value="id|Status|Color"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!--
  X-Means: the active clusterer in this variant (activated=true).
  Configured with MixedMeasures and FastKMeans. add_as_label is not set here;
  the Set Role operator wired after it assigns the label role instead.
-->
<operator
    activated="true"
    class="x_means"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="X-Means"
    width="90"
    x="782"
    y="136">
  <parameter key="measure_types" value="MixedMeasures"/>
  <parameter key="clustering_algorithm" value="FastKMeans"/>
</operator>
<!--
  Set Role: enabled in this variant and wired between X-Means and Decision Tree.
  Assigns the role label to the attribute named cluster, which is the workaround
  described in the accompanying post.
-->
<operator
    activated="true"
    class="set_role"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Set Role"
    width="90"
    x="916"
    y="238">
  <parameter key="attribute_name" value="cluster"/>
  <parameter key="target_role" value="label"/>
  <!-- No additional role assignments. -->
  <list key="set_additional_roles"/>
</operator>
<!--
  Clustering (k-Means): switched off in this variant (activated=false) and not
  referenced by any connect element. Kept with its settings from the k-Means
  variant: add_as_label=true, k=4, MixedMeasures, max_optimization_steps=4.
-->
<operator
    activated="false"
    class="concurrency:k_means"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Clustering"
    width="90"
    x="782"
    y="34">
  <parameter key="add_as_label" value="true"/>
  <parameter key="k" value="4"/>
  <parameter key="measure_types" value="MixedMeasures"/>
  <parameter key="max_optimization_steps" value="4"/>
</operator>
<!--
  Decision Tree: trained on the output of Set Role to make the clusters interpretable.
  Criterion is information_gain; pruning and pre-pruning are both switched off.
-->
<operator
    activated="true"
    class="concurrency:parallel_decision_tree"
    compatibility="8.2.000"
    expanded="true"
    height="103"
    name="Decision Tree"
    width="90"
    x="1050"
    y="187">
  <parameter key="criterion" value="information_gain"/>
  <parameter key="apply_pruning" value="false"/>
  <parameter key="apply_prepruning" value="false"/>
</operator>
<!--
  Join: per the connect elements, left is the Decision Tree exampleSet output and
  right is Multiply output 2.
  use_id_attribute_as_key is false, so the key is listed explicitly: id = id.
-->
<operator
    activated="true"
    class="concurrency:join"
    compatibility="8.2.000"
    expanded="true"
    height="82"
    name="Join"
    width="90"
    x="1184"
    y="289">
  <parameter key="use_id_attribute_as_key" value="false"/>
  <list key="key_attributes">
    <parameter key="id" value="id"/>
  </list>
</operator>
<!-- Wiring of the X-Means variant. Data preparation chain: Create ExampleSet, Generate ID, Generate Attributes, Multiply. -->
<connect from_op="Create ExampleSet" from_port="output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Multiply" to_port="input"/>
<!-- Multiply splits the flow: output 1 goes to attribute selection and clustering, output 2 goes straight to the right side of Join. -->
<connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="right"/>
<!-- X-Means output passes through Set Role before reaching the Decision Tree; the disabled Clustering operator is not wired. -->
<connect from_op="Select Attributes" from_port="example set output" to_op="X-Means" to_port="example set"/>
<connect from_op="X-Means" from_port="clustered set" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Decision Tree" to_port="training set"/>
<!-- Process results: the tree model on result 2 and the joined example set on result 3; result 1 is not connected. -->
<connect from_op="Decision Tree" from_port="model" to_port="result 2"/>
<connect from_op="Decision Tree" from_port="exampleSet" to_op="Join" to_port="left"/>
<connect from_op="Join" from_port="join" to_port="result 3"/>
<!-- Spacing values for the process ports (presumably canvas layout only; confirm). -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="168"/>
<portSpacing port="sink_result 3" spacing="126"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
The thing is: if I check the add cluster attribute it adds the cluster attribute like it says it will, but, and here is the report: if I check add as label, it doesn't add the cluster attribute with a label role, but actually it adds the label attribute. I had to put a Set Role operator after the X-Means operator to actually get a label and feed the Decision Tree.
Is this a desired behavior? If so, why? It's counterintuitive.
All the best,
Rodrigo.
Comments
Hi there,
Update: in RapidMiner 9.0 Beta, now it doesn't change the name cluster by label yet the model still doesn't make use of the add as label checkbox. To use the cluster as a label (e.g. to interpret results with a decision tree, which is my favourite trick under the sleeve), I need to apply Set Role again. 50% of the bug/feature has been resolved.
Is there anything I can do to help?
All the best,
Rodrigo.
thx @rfuentealba. I forwarded your comment to the dev team and they will follow up if needed. Thanks!
Bug fix will be in Studio 9.0 release. Thanks for reporting @rfuentealba!