The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
"Model Applier Bug causing incorrect interpretation of Labels (RapidMiner 4.4)"
wotsiznamiz
Member Posts: 9 Contributor II
Hi - I've noticed a handful of related threads on this topic, but no great solutions that will work with the problem I'm facing...
I have used RapidMiner + WVTool to build a text-mining model. When I use ModelApplier to apply that model to a new data set, it seems to somehow misinterpret my original labels.
After several failed attempts to get a correct validation of my model using Model Applier, I finally tested this out by building a model and then applying that same model to the *identical* data set sorted merely in a different order. When I do this, it returns a file that has somehow flipped the values in the label field. So clearly something is going wrong either with my code or with the ModelApplier Operator.
Here are some threads that sound related...
http://rapid-i.com/rapidforum/index.php/topic,776.0.html
http://rapid-i.com/rapidforum/index.php/topic,281.0.html
http://rapid-i.com/rapidforum/index.php/topic,319.0.html
Here's my code...
<?xml version="1.0" encoding="windows-1252"?>
<!--
  RapidMiner 4.4 text-mining process in two stages, both under the Root chain:
    1. ModDev: load the development spreadsheet, build term-frequency word
       vectors, cross-validate a linear LibSVM classifier and write the models.
    2. ModVal: load a second spreadsheet, build word vectors with the same
       token pipeline, apply the model and write the scored examples.
  Output file names use the %{process_name} macro with an OUT_ prefix.
-->
<process version="4.4">
  <operator name="Root" class="Process" expanded="yes">
    <parameter key="logverbosity" value="init"/>
    <parameter key="logfile" value="OUT_%{process_name}_RootLog0.log"/>
    <parameter key="resultfile" value="OUT_%{process_name}_RootResults0.res"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="encoding" value="SYSTEM"/>

    <operator name="MemoryCleanUp_START" class="MemoryCleanUp"/>

    <!-- Stage 1 (ModDev): label is taken from column 1, id from column 3. -->
    <operator name="ExcelExampleSource_ModDev" class="ExcelExampleSource">
      <parameter key="excel_file" value="%{process_name}.xls"/>
      <parameter key="sheet_number" value="1"/>
      <parameter key="row_offset" value="0"/>
      <parameter key="column_offset" value="0"/>
      <parameter key="first_row_as_names" value="true"/>
      <parameter key="create_label" value="true"/>
      <parameter key="label_column" value="1"/>
      <parameter key="create_id" value="true"/>
      <parameter key="id_column" value="3"/>
      <parameter key="decimal_point_character" value="."/>
      <parameter key="datamanagement" value="double_array"/>
    </operator>

    <operator name="Nominal2String_ModDev" class="Nominal2String"/>

    <!-- Word-vector creation for the development data (TermFrequency, prune_below=50). -->
    <operator name="StringTextInput_ModDev" class="StringTextInput" expanded="no">
      <parameter key="filter_nominal_attributes" value="false"/>
      <parameter key="remove_original_attributes" value="true"/>
      <parameter key="default_content_type" value=""/>
      <parameter key="default_content_encoding" value=""/>
      <parameter key="default_content_language" value=""/>
      <parameter key="prune_below" value="50"/>
      <parameter key="prune_above" value="-1"/>
      <parameter key="vector_creation" value="TermFrequency"/>
      <parameter key="use_content_attributes" value="false"/>
      <parameter key="use_given_word_list" value="false"/>
      <parameter key="return_word_list" value="false"/>
      <parameter key="output_word_list" value="OUT_%{process_name}_Words_ModDev.txt"/>
      <parameter key="id_attribute_type" value="number"/>
      <list key="namespaces"/>
      <parameter key="create_text_visualizer" value="false"/>
      <parameter key="on_the_fly_pruning" value="-1"/>

      <!-- Token pipeline: tokenize, length filter, stem, stopword filter, n-grams. -->
      <operator name="StringTokenizer" class="StringTokenizer"/>
      <operator name="TokenLengthFilter" class="TokenLengthFilter">
        <parameter key="min_chars" value="4"/>
        <parameter key="max_chars" value="2147483647"/>
      </operator>
      <operator name="LovinsStemmer" class="LovinsStemmer"/>
      <operator name="StopwordFilterFile" class="StopwordFilterFile">
        <parameter key="file" value="_STOPWORDS.txt"/>
        <parameter key="case_sensitive" value="false"/>
      </operator>
      <operator name="TermNGramGenerator" class="TermNGramGenerator">
        <parameter key="max_length" value="4"/>
      </operator>
    </operator>

    <!-- Writes the vectorised development set plus its attribute description file. -->
    <operator name="ExampleSetWriter_ModDev" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModDevInput.dat"/>
      <parameter key="attribute_description_file" value="OUT_%{process_name}_AttDescFile_ModDevInput.aml"/>
      <parameter key="format" value="dense"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="false"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>

    <operator name="MemoryCleanUp_02" class="MemoryCleanUp"/>

    <!--
      Cross-validation with 10 stratified folds. The first child is the learner,
      the second child is the chain that applies and evaluates the model.
      create_complete_model is true; presumably this is the model that
      ModelWriter2 and ModelApplier_ModVal consume later (TODO confirm).
    -->
    <operator name="XValidation" class="XValidation" expanded="yes">
      <parameter key="keep_example_set" value="true"/>
      <parameter key="create_complete_model" value="true"/>
      <parameter key="average_performances_only" value="true"/>
      <parameter key="leave_one_out" value="false"/>
      <parameter key="number_of_validations" value="10"/>
      <parameter key="sampling_type" value="stratified sampling"/>
      <parameter key="local_random_seed" value="-1"/>

      <!-- Learner: C-SVC with a linear kernel, confidences enabled. -->
      <operator name="LibSVMLearner" class="LibSVMLearner">
        <parameter key="keep_example_set" value="true"/>
        <parameter key="svm_type" value="C-SVC"/>
        <parameter key="kernel_type" value="linear"/>
        <parameter key="degree" value="1"/>
        <parameter key="gamma" value="0.0"/>
        <parameter key="coef0" value="0.0"/>
        <parameter key="C" value="0.0"/>
        <parameter key="nu" value="0.5"/>
        <parameter key="cache_size" value="80"/>
        <parameter key="epsilon" value="0.0010"/>
        <parameter key="p" value="0.1"/>
        <list key="class_weights"/>
        <parameter key="shrinking" value="true"/>
        <parameter key="calculate_confidences" value="true"/>
        <parameter key="confidence_for_multiclass" value="true"/>
      </operator>

      <!-- Test side of each fold: apply, score, then write results and model. -->
      <operator name="OperatorChain" class="OperatorChain" expanded="no">
        <operator name="ModelApplier" class="ModelApplier">
          <parameter key="keep_model" value="true"/>
          <list key="application_parameters"/>
          <parameter key="create_view" value="false"/>
        </operator>

        <!-- Binominal performance criteria; AUC is the main criterion. -->
        <operator name="BinominalClassificationPerformance" class="BinominalClassificationPerformance">
          <parameter key="keep_example_set" value="true"/>
          <parameter key="main_criterion" value="AUC"/>
          <parameter key="AUC" value="true"/>
          <parameter key="precision" value="true"/>
          <parameter key="recall" value="true"/>
          <parameter key="lift" value="true"/>
          <parameter key="fallout" value="true"/>
          <parameter key="f_measure" value="true"/>
          <parameter key="false_positive" value="true"/>
          <parameter key="false_negative" value="true"/>
          <parameter key="true_positive" value="true"/>
          <parameter key="true_negative" value="true"/>
          <parameter key="sensitivity" value="true"/>
          <parameter key="specificity" value="true"/>
          <parameter key="youden" value="true"/>
          <parameter key="positive_predictive_value" value="true"/>
          <parameter key="negative_predictive_value" value="true"/>
          <parameter key="psep" value="true"/>
          <parameter key="skip_undefined_labels" value="true"/>
          <parameter key="use_example_weights" value="true"/>
        </operator>

        <!-- Scored fold examples, written with the special format "$i $l $p $d". -->
        <operator name="ECS_ModelResults" class="ExampleSetWriter">
          <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModDevOutput_LiftCurve.dat"/>
          <parameter key="format" value="special_format"/>
          <parameter key="special_format" value="$i $l $p $d"/>
          <parameter key="fraction_digits" value="-1"/>
          <parameter key="quote_nominal_values" value="true"/>
          <parameter key="zipped" value="false"/>
          <parameter key="overwrite_mode" value="overwrite first, append then"/>
        </operator>
        <operator name="PerformanceWriter" class="PerformanceWriter">
          <parameter key="performance_file" value="OUT_%{process_name}_Perf_ModDevOutput.per"/>
        </operator>
        <operator name="ResultWriter" class="ResultWriter">
          <parameter key="result_file" value="OUT_%{process_name}_Results_ModDevOutput.res"/>
        </operator>
        <operator name="ModelWriter1" class="ModelWriter">
          <parameter key="model_file" value="OUT_%{process_name}_Model_ModDevOutput1.mod"/>
          <parameter key="overwrite_existing_file" value="true"/>
          <parameter key="output_type" value="XML"/>
        </operator>
      </operator>
    </operator>

    <!-- Writes the model available after the cross-validation as XML. -->
    <operator name="ModelWriter2" class="ModelWriter">
      <parameter key="model_file" value="OUT_%{process_name}_Model_ModDevOutput2.mod"/>
      <parameter key="overwrite_existing_file" value="true"/>
      <parameter key="output_type" value="XML"/>
    </operator>

    <!--
      Stage 2 (ModVal): second spreadsheet, read from an absolute Windows path.
      NOTE(review): the label is created again from column 1, independently of
      the development source. Verify in the Meta Data View that the order of
      the label's possible values matches the development set.
    -->
    <operator name="ExcelExampleSource_ModVal" class="ExcelExampleSource">
      <parameter key="excel_file" value="C:\_20090403_NPSr_Dec08_KWA\_20090403_NPSr_Dec08.xls"/>
      <parameter key="sheet_number" value="1"/>
      <parameter key="row_offset" value="0"/>
      <parameter key="column_offset" value="0"/>
      <parameter key="first_row_as_names" value="true"/>
      <parameter key="create_label" value="true"/>
      <parameter key="label_column" value="1"/>
      <parameter key="create_id" value="true"/>
      <parameter key="id_column" value="3"/>
      <parameter key="decimal_point_character" value="."/>
      <parameter key="datamanagement" value="double_array"/>
    </operator>

    <operator name="Nominal2String_ModVal" class="Nominal2String"/>

    <!--
      Word-vector creation for the validation data.
      NOTE(review): use_given_word_list is false and prune_below is 1 here,
      versus 50 in StringTextInput_ModDev, so this operator presumably derives
      its own vocabulary. Confirm the resulting attributes match those the
      model was trained on.
    -->
    <operator name="StringTextInput_ModVal" class="StringTextInput" expanded="yes">
      <parameter key="filter_nominal_attributes" value="false"/>
      <parameter key="remove_original_attributes" value="true"/>
      <parameter key="default_content_type" value=""/>
      <parameter key="default_content_encoding" value=""/>
      <parameter key="default_content_language" value=""/>
      <parameter key="prune_below" value="1"/>
      <parameter key="prune_above" value="-1"/>
      <parameter key="vector_creation" value="TermFrequency"/>
      <parameter key="use_content_attributes" value="false"/>
      <parameter key="use_given_word_list" value="false"/>
      <parameter key="return_word_list" value="false"/>
      <parameter key="output_word_list" value="OUT_%{process_name}_Words_ModVal.txt"/>
      <parameter key="id_attribute_type" value="number"/>
      <list key="namespaces"/>
      <parameter key="create_text_visualizer" value="false"/>
      <parameter key="on_the_fly_pruning" value="-1"/>

      <!-- Same token pipeline as ModDev; the stopword file uses an absolute path here. -->
      <operator name="StringTokenizer (2)" class="StringTokenizer"/>
      <operator name="TokenLengthFilter (2)" class="TokenLengthFilter">
        <parameter key="min_chars" value="4"/>
        <parameter key="max_chars" value="2147483647"/>
      </operator>
      <operator name="LovinsStemmer (2)" class="LovinsStemmer"/>
      <operator name="StopwordFilterFile (2)" class="StopwordFilterFile">
        <parameter key="file" value="C:\_04_NPS_ModelVal\_STOPWORDS.txt"/>
        <parameter key="case_sensitive" value="false"/>
      </operator>
      <operator name="TermNGramGenerator (2)" class="TermNGramGenerator">
        <parameter key="max_length" value="4"/>
      </operator>
    </operator>

    <!-- Writes the vectorised validation set before the model is applied. -->
    <operator name="ExampleSetWriter_ModValInput" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModValInput.dat"/>
      <parameter key="attribute_description_file" value="OUT_%{process_name}_AttDescFile_ModVal.aml"/>
      <parameter key="format" value="dense"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="false"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>

    <!-- Applies the model to the validation examples; keep_model is false. -->
    <operator name="ModelApplier_ModVal" class="ModelApplier">
      <parameter key="keep_model" value="false"/>
      <list key="application_parameters"/>
      <parameter key="create_view" value="false"/>
    </operator>

    <!-- Scored validation examples, written with the special format "$i $l $p $d". -->
    <operator name="ExampleSetWriter_ModVal" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModValOutput_LiftCurve.dat"/>
      <parameter key="format" value="special_format"/>
      <parameter key="special_format" value="$i $l $p $d"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="true"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>
  </operator>
</process>
I have used RapidMiner + WVTool to build a text-mining model. When I use ModelApplier to apply that model to a new data set, it seems to somehow misinterpret my original labels.
After several failed attempts to get a correct validation of my model using Model Applier, I finally tested this out by building a model and then applying that same model to the *identical* data set sorted merely in a different order. When I do this, it returns a file that has somehow flipped the values in the label field. So clearly something is going wrong either with my code or with the ModelApplier Operator.
Here are some threads that sound related...
http://rapid-i.com/rapidforum/index.php/topic,776.0.html
http://rapid-i.com/rapidforum/index.php/topic,281.0.html
http://rapid-i.com/rapidforum/index.php/topic,319.0.html
Here's my code...
<?xml version="1.0" encoding="windows-1252"?>
<!--
  RapidMiner 4.4 text-mining process in two stages, both under the Root chain:
    1. ModDev: load the development spreadsheet, build term-frequency word
       vectors, cross-validate a linear LibSVM classifier and write the models.
    2. ModVal: load a second spreadsheet, build word vectors with the same
       token pipeline, apply the model and write the scored examples.
  Output file names use the %{process_name} macro with an OUT_ prefix.
-->
<process version="4.4">
  <operator name="Root" class="Process" expanded="yes">
    <parameter key="logverbosity" value="init"/>
    <parameter key="logfile" value="OUT_%{process_name}_RootLog0.log"/>
    <parameter key="resultfile" value="OUT_%{process_name}_RootResults0.res"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="encoding" value="SYSTEM"/>

    <operator name="MemoryCleanUp_START" class="MemoryCleanUp"/>

    <!-- Stage 1 (ModDev): label is taken from column 1, id from column 3. -->
    <operator name="ExcelExampleSource_ModDev" class="ExcelExampleSource">
      <parameter key="excel_file" value="%{process_name}.xls"/>
      <parameter key="sheet_number" value="1"/>
      <parameter key="row_offset" value="0"/>
      <parameter key="column_offset" value="0"/>
      <parameter key="first_row_as_names" value="true"/>
      <parameter key="create_label" value="true"/>
      <parameter key="label_column" value="1"/>
      <parameter key="create_id" value="true"/>
      <parameter key="id_column" value="3"/>
      <parameter key="decimal_point_character" value="."/>
      <parameter key="datamanagement" value="double_array"/>
    </operator>

    <operator name="Nominal2String_ModDev" class="Nominal2String"/>

    <!-- Word-vector creation for the development data (TermFrequency, prune_below=50). -->
    <operator name="StringTextInput_ModDev" class="StringTextInput" expanded="no">
      <parameter key="filter_nominal_attributes" value="false"/>
      <parameter key="remove_original_attributes" value="true"/>
      <parameter key="default_content_type" value=""/>
      <parameter key="default_content_encoding" value=""/>
      <parameter key="default_content_language" value=""/>
      <parameter key="prune_below" value="50"/>
      <parameter key="prune_above" value="-1"/>
      <parameter key="vector_creation" value="TermFrequency"/>
      <parameter key="use_content_attributes" value="false"/>
      <parameter key="use_given_word_list" value="false"/>
      <parameter key="return_word_list" value="false"/>
      <parameter key="output_word_list" value="OUT_%{process_name}_Words_ModDev.txt"/>
      <parameter key="id_attribute_type" value="number"/>
      <list key="namespaces"/>
      <parameter key="create_text_visualizer" value="false"/>
      <parameter key="on_the_fly_pruning" value="-1"/>

      <!-- Token pipeline: tokenize, length filter, stem, stopword filter, n-grams. -->
      <operator name="StringTokenizer" class="StringTokenizer"/>
      <operator name="TokenLengthFilter" class="TokenLengthFilter">
        <parameter key="min_chars" value="4"/>
        <parameter key="max_chars" value="2147483647"/>
      </operator>
      <operator name="LovinsStemmer" class="LovinsStemmer"/>
      <operator name="StopwordFilterFile" class="StopwordFilterFile">
        <parameter key="file" value="_STOPWORDS.txt"/>
        <parameter key="case_sensitive" value="false"/>
      </operator>
      <operator name="TermNGramGenerator" class="TermNGramGenerator">
        <parameter key="max_length" value="4"/>
      </operator>
    </operator>

    <!-- Writes the vectorised development set plus its attribute description file. -->
    <operator name="ExampleSetWriter_ModDev" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModDevInput.dat"/>
      <parameter key="attribute_description_file" value="OUT_%{process_name}_AttDescFile_ModDevInput.aml"/>
      <parameter key="format" value="dense"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="false"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>

    <operator name="MemoryCleanUp_02" class="MemoryCleanUp"/>

    <!--
      Cross-validation with 10 stratified folds. The first child is the learner,
      the second child is the chain that applies and evaluates the model.
      create_complete_model is true; presumably this is the model that
      ModelWriter2 and ModelApplier_ModVal consume later (TODO confirm).
    -->
    <operator name="XValidation" class="XValidation" expanded="yes">
      <parameter key="keep_example_set" value="true"/>
      <parameter key="create_complete_model" value="true"/>
      <parameter key="average_performances_only" value="true"/>
      <parameter key="leave_one_out" value="false"/>
      <parameter key="number_of_validations" value="10"/>
      <parameter key="sampling_type" value="stratified sampling"/>
      <parameter key="local_random_seed" value="-1"/>

      <!-- Learner: C-SVC with a linear kernel, confidences enabled. -->
      <operator name="LibSVMLearner" class="LibSVMLearner">
        <parameter key="keep_example_set" value="true"/>
        <parameter key="svm_type" value="C-SVC"/>
        <parameter key="kernel_type" value="linear"/>
        <parameter key="degree" value="1"/>
        <parameter key="gamma" value="0.0"/>
        <parameter key="coef0" value="0.0"/>
        <parameter key="C" value="0.0"/>
        <parameter key="nu" value="0.5"/>
        <parameter key="cache_size" value="80"/>
        <parameter key="epsilon" value="0.0010"/>
        <parameter key="p" value="0.1"/>
        <list key="class_weights"/>
        <parameter key="shrinking" value="true"/>
        <parameter key="calculate_confidences" value="true"/>
        <parameter key="confidence_for_multiclass" value="true"/>
      </operator>

      <!-- Test side of each fold: apply, score, then write results and model. -->
      <operator name="OperatorChain" class="OperatorChain" expanded="no">
        <operator name="ModelApplier" class="ModelApplier">
          <parameter key="keep_model" value="true"/>
          <list key="application_parameters"/>
          <parameter key="create_view" value="false"/>
        </operator>

        <!-- Binominal performance criteria; AUC is the main criterion. -->
        <operator name="BinominalClassificationPerformance" class="BinominalClassificationPerformance">
          <parameter key="keep_example_set" value="true"/>
          <parameter key="main_criterion" value="AUC"/>
          <parameter key="AUC" value="true"/>
          <parameter key="precision" value="true"/>
          <parameter key="recall" value="true"/>
          <parameter key="lift" value="true"/>
          <parameter key="fallout" value="true"/>
          <parameter key="f_measure" value="true"/>
          <parameter key="false_positive" value="true"/>
          <parameter key="false_negative" value="true"/>
          <parameter key="true_positive" value="true"/>
          <parameter key="true_negative" value="true"/>
          <parameter key="sensitivity" value="true"/>
          <parameter key="specificity" value="true"/>
          <parameter key="youden" value="true"/>
          <parameter key="positive_predictive_value" value="true"/>
          <parameter key="negative_predictive_value" value="true"/>
          <parameter key="psep" value="true"/>
          <parameter key="skip_undefined_labels" value="true"/>
          <parameter key="use_example_weights" value="true"/>
        </operator>

        <!-- Scored fold examples, written with the special format "$i $l $p $d". -->
        <operator name="ECS_ModelResults" class="ExampleSetWriter">
          <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModDevOutput_LiftCurve.dat"/>
          <parameter key="format" value="special_format"/>
          <parameter key="special_format" value="$i $l $p $d"/>
          <parameter key="fraction_digits" value="-1"/>
          <parameter key="quote_nominal_values" value="true"/>
          <parameter key="zipped" value="false"/>
          <parameter key="overwrite_mode" value="overwrite first, append then"/>
        </operator>
        <operator name="PerformanceWriter" class="PerformanceWriter">
          <parameter key="performance_file" value="OUT_%{process_name}_Perf_ModDevOutput.per"/>
        </operator>
        <operator name="ResultWriter" class="ResultWriter">
          <parameter key="result_file" value="OUT_%{process_name}_Results_ModDevOutput.res"/>
        </operator>
        <operator name="ModelWriter1" class="ModelWriter">
          <parameter key="model_file" value="OUT_%{process_name}_Model_ModDevOutput1.mod"/>
          <parameter key="overwrite_existing_file" value="true"/>
          <parameter key="output_type" value="XML"/>
        </operator>
      </operator>
    </operator>

    <!-- Writes the model available after the cross-validation as XML. -->
    <operator name="ModelWriter2" class="ModelWriter">
      <parameter key="model_file" value="OUT_%{process_name}_Model_ModDevOutput2.mod"/>
      <parameter key="overwrite_existing_file" value="true"/>
      <parameter key="output_type" value="XML"/>
    </operator>

    <!--
      Stage 2 (ModVal): second spreadsheet, read from an absolute Windows path.
      NOTE(review): the label is created again from column 1, independently of
      the development source. Verify in the Meta Data View that the order of
      the label's possible values matches the development set.
    -->
    <operator name="ExcelExampleSource_ModVal" class="ExcelExampleSource">
      <parameter key="excel_file" value="C:\_20090403_NPSr_Dec08_KWA\_20090403_NPSr_Dec08.xls"/>
      <parameter key="sheet_number" value="1"/>
      <parameter key="row_offset" value="0"/>
      <parameter key="column_offset" value="0"/>
      <parameter key="first_row_as_names" value="true"/>
      <parameter key="create_label" value="true"/>
      <parameter key="label_column" value="1"/>
      <parameter key="create_id" value="true"/>
      <parameter key="id_column" value="3"/>
      <parameter key="decimal_point_character" value="."/>
      <parameter key="datamanagement" value="double_array"/>
    </operator>

    <operator name="Nominal2String_ModVal" class="Nominal2String"/>

    <!--
      Word-vector creation for the validation data.
      NOTE(review): use_given_word_list is false and prune_below is 1 here,
      versus 50 in StringTextInput_ModDev, so this operator presumably derives
      its own vocabulary. Confirm the resulting attributes match those the
      model was trained on.
    -->
    <operator name="StringTextInput_ModVal" class="StringTextInput" expanded="yes">
      <parameter key="filter_nominal_attributes" value="false"/>
      <parameter key="remove_original_attributes" value="true"/>
      <parameter key="default_content_type" value=""/>
      <parameter key="default_content_encoding" value=""/>
      <parameter key="default_content_language" value=""/>
      <parameter key="prune_below" value="1"/>
      <parameter key="prune_above" value="-1"/>
      <parameter key="vector_creation" value="TermFrequency"/>
      <parameter key="use_content_attributes" value="false"/>
      <parameter key="use_given_word_list" value="false"/>
      <parameter key="return_word_list" value="false"/>
      <parameter key="output_word_list" value="OUT_%{process_name}_Words_ModVal.txt"/>
      <parameter key="id_attribute_type" value="number"/>
      <list key="namespaces"/>
      <parameter key="create_text_visualizer" value="false"/>
      <parameter key="on_the_fly_pruning" value="-1"/>

      <!-- Same token pipeline as ModDev; the stopword file uses an absolute path here. -->
      <operator name="StringTokenizer (2)" class="StringTokenizer"/>
      <operator name="TokenLengthFilter (2)" class="TokenLengthFilter">
        <parameter key="min_chars" value="4"/>
        <parameter key="max_chars" value="2147483647"/>
      </operator>
      <operator name="LovinsStemmer (2)" class="LovinsStemmer"/>
      <operator name="StopwordFilterFile (2)" class="StopwordFilterFile">
        <parameter key="file" value="C:\_04_NPS_ModelVal\_STOPWORDS.txt"/>
        <parameter key="case_sensitive" value="false"/>
      </operator>
      <operator name="TermNGramGenerator (2)" class="TermNGramGenerator">
        <parameter key="max_length" value="4"/>
      </operator>
    </operator>

    <!-- Writes the vectorised validation set before the model is applied. -->
    <operator name="ExampleSetWriter_ModValInput" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModValInput.dat"/>
      <parameter key="attribute_description_file" value="OUT_%{process_name}_AttDescFile_ModVal.aml"/>
      <parameter key="format" value="dense"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="false"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>

    <!-- Applies the model to the validation examples; keep_model is false. -->
    <operator name="ModelApplier_ModVal" class="ModelApplier">
      <parameter key="keep_model" value="false"/>
      <list key="application_parameters"/>
      <parameter key="create_view" value="false"/>
    </operator>

    <!-- Scored validation examples, written with the special format "$i $l $p $d". -->
    <operator name="ExampleSetWriter_ModVal" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModValOutput_LiftCurve.dat"/>
      <parameter key="format" value="special_format"/>
      <parameter key="special_format" value="$i $l $p $d"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="true"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>
  </operator>
</process>
Tagged:
0
Answers
Since I do not have your files, I cannot rerun the process. Please take a look at this thread: http://rapid-i.com/rapidforum/index.php/topic,776.msg2897.html#new
Now try this:
Check the example sets in the result tab after both apply steps to see whether the sequence of possible values has changed for the label (Meta Data View). If it has, then you were a victim of the same bug as in the mentioned thread. I hope the workaround works for your problem, too.
kind regards,
Steffen
PS: Oh, and thanks for the flowers ;D.