"Linear regression missing values"
Hello,
I am new to RapidMiner Studio and have little experience with data science. I am trying to predict the value of one particular sensor from its associated timestamp. At some point I pass the data into a Filter Examples operator to keep only the rows without missing values, so that I can send the data into a Linear Regression. I checked the output of the Filter and the input of the Linear Regression, and there are no missing values. But when I launch the process, I get a pop-up error from the Linear Regression saying:
The data contains missing values which is not allowed for Linear Regression.
Some operators cannot work on data sets with missing values. You should use one of the preprocessing operators like Replace Missing Values before applying this operator in order to replace the missing values by some valid values.
Here is my .rmp
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: fit a Linear Regression that predicts sensor_value from
  data_timestamp, then apply the model to the rows whose sensor_value is missing.

  NOTE(review): the opening process element was absent from the pasted snippet,
  which left the closing tag unmatched. The version below is taken from the
  operators' compatibility attributes; confirm against the original .rmp file.
-->
<process version="8.0.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">

      <!-- Load the raw data from the repository. -->
      <operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve energy_data" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/energy_data"/>
      </operator>

      <!-- Turn data_timestamp into a number (time_unit = second) so it can be
           used as a regression input; the original date attribute is dropped. -->
      <operator activated="true" class="date_to_numerical" compatibility="8.0.001" expanded="true" height="82" name="Date to Numerical" width="90" x="112" y="136">
        <parameter key="attribute_name" value="data_timestamp"/>
        <parameter key="time_unit" value="second"/>
        <parameter key="millisecond_relative_to" value="second"/>
        <parameter key="second_relative_to" value="day"/>
        <parameter key="minute_relative_to" value="hour"/>
        <parameter key="hour_relative_to" value="day"/>
        <parameter key="day_relative_to" value="month"/>
        <parameter key="week_relative_to" value="year"/>
        <parameter key="month_relative_to" value="year"/>
        <parameter key="quarter_relative_to" value="year"/>
        <parameter key="half_year_relative_to" value="year"/>
        <parameter key="year_relative_to" value="era"/>
        <parameter key="keep_old_attribute" value="false"/>
      </operator>

      <!-- Keep only the three attributes the rest of the process uses. -->
      <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="data_timestamp|sensor_value|id"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>

      <!-- sensor_value becomes the label (prediction target), id becomes the
           id attribute, and data_timestamp stays a regular attribute.
           From here on sensor_value is a special attribute, which matters
           for the two filters below. -->
      <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role" width="90" x="313" y="136">
        <parameter key="attribute_name" value="sensor_value"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles">
          <parameter key="data_timestamp" value="regular"/>
          <parameter key="id" value="id"/>
        </list>
      </operator>

      <!-- Duplicate the example set: output 1 feeds training, output 2 feeds scoring. -->
      <operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="447" y="85"/>

      <!-- Training branch: keep the rows whose label is present.
           The previous condition, no_missing_attributes, only looks at regular
           attributes. Because sensor_value already has the label role, rows with
           a missing sensor_value passed through and Linear Regression rejected
           the data. no_missing_labels tests the label itself.
           NOTE(review): rows with a missing data_timestamp are not removed by
           this condition; add a second filter if the source data can contain them. -->
      <operator activated="true" class="filter_examples" compatibility="8.0.001" expanded="true" height="103" name="Filter Examples" width="90" x="581" y="34">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="no_missing_labels"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list"/>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>

      <!-- Fit the model on the rows that have a known sensor_value. -->
      <operator activated="true" class="linear_regression" compatibility="8.0.001" expanded="true" height="103" name="Linear Regression" width="90" x="715" y="34">
        <parameter key="feature_selection" value="M5 prime"/>
        <parameter key="alpha" value="0.05"/>
        <parameter key="max_iterations" value="10"/>
        <parameter key="forward_alpha" value="0.05"/>
        <parameter key="backward_alpha" value="0.05"/>
        <parameter key="eliminate_colinear_features" value="true"/>
        <parameter key="min_tolerance" value="0.05"/>
        <parameter key="use_bias" value="true"/>
        <parameter key="ridge" value="1.0E-8"/>
      </operator>

      <!-- Scoring branch: keep the rows whose label is missing, i.e. the rows
           that need a prediction. missing_labels is the counterpart of the
           training filter; missing_attributes would only have matched rows
           with a missing regular attribute. -->
      <operator activated="true" class="filter_examples" compatibility="8.0.001" expanded="true" height="103" name="Filter Examples (2)" width="90" x="581" y="187">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="missing_labels"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list"/>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>

      <!-- Predict sensor_value for the unlabelled rows. -->
      <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="782" y="238">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>

      <!-- Preprocessing chain. -->
      <connect from_op="Retrieve energy_data" from_port="output" to_op="Date to Numerical" to_port="example set input"/>
      <connect from_op="Date to Numerical" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>

      <!-- Split into the training and scoring branches. -->
      <connect from_op="Multiply" from_port="output 1" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Filter Examples (2)" to_port="example set input"/>

      <!-- Train, then score. -->
      <connect from_op="Filter Examples" from_port="example set output" to_op="Linear Regression" to_port="training set"/>
      <connect from_op="Linear Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Apply Model" to_port="unlabelled data"/>

      <!-- Process results: the scored rows and the fitted model. -->
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Thanks,
Quentin
Best Answer
-
jmergler Employee-RapidMiner, RapidMiner Certified Analyst, Member, University Professor Posts: 41 Guru
I'm sorry, I think that is because I had updated the Retrieve Operator settings. Did you try updating your retrieve operator? There are two more things you could do manually with your original process. Either move Set Role to after each Filter Examples, or change each Filter Examples to custom_filters, and then sensor_value 'is not missing' or 'is missing' respectively.
1
Answers
Edit: Adding an Impute or Replace Missing Values block just before the Linear Regression does not change anything about the outcome.
Hi Quentin,
I changed your filters to custom_filters; please let us know if this works for you.
Thank you for answering.
The new xml gives me this error at almost every block:
Expected ExampleSet but received IOObject
I searched Google for this error and could not find anything within the first three pages of results, which is quite frightening, I must say.
Setting the role after each filter resolved the issue. The Linear Regression warning is still there, but the process does work. Thank you!