The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
[HowTo] Create Box Plots to Check Regressions
MartinLiebig
Administrator, Moderator, Employee-RapidMiner, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,533 RM Data Scientist
Hey guys!
This is not a question, but rather a how-to. I frequently use box plots to assess the quality of regression problems.
What I do is discretize the prediction and then use a box plot to compare each bin to the real value. It looks like this:
Here we see a lot. Most importantly, this model is flat at the beginning and the end, and there is a bit of a correlation in the center. I prefer these plots over normal scatter plots of True-vs-Predicted, because with scatter plots you may get distracted by some outliers.
Attached is an example process that shows how to generate such a plot. It needs a bit of preprocessing.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (version 9.6.000): trains a generalized linear model on
  windowed gas-price data, applies it to the rows the date filter did not
  match, discretizes the prediction into 20 bins and sorts by it, so that the
  true horizon value can be box-plotted per prediction bin.
-->
<process version="9.6.000">
  <!-- Process context: no inputs, outputs or macros are declared. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Step 1: load the "Prices of Gas Station" sample data set from the repository. -->
      <operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve Prices of Gas Station" width="90" x="112" y="136">
        <parameter key="repository_entry" value="//Samples/Time Series/data sets/Prices of Gas Station"/>
      </operator>
      <!-- Step 2: window the gas-price attribute (window size 24, indexed by "date")
           and create a horizon of size 1 on the same attribute as the label. -->
      <operator activated="true" class="time_series:windowing" compatibility="9.6.000" expanded="true" height="82" name="Windowing" width="90" x="246" y="136">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="gas price / euro (times 1000)"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="has_indices" value="true"/>
        <parameter key="indices_attribute" value="date"/>
        <parameter key="window_size" value="24"/>
        <parameter key="no_overlapping_windows" value="true"/>
        <parameter key="step_size" value="1"/>
        <parameter key="create_horizon_(labels)" value="true"/>
        <parameter key="horizon_attribute" value="gas price / euro (times 1000)"/>
        <parameter key="horizon_size" value="1"/>
        <parameter key="horizon_offset" value="0"/>
      </operator>
      <!-- Step 3: split on "Last date in window" with a .gt. comparison against
           01/01/2018. The connections below send the matched rows to model
           training and the unmatched rows to Apply Model. -->
      <operator activated="true" class="filter_examples" compatibility="9.6.000" expanded="true" height="103" name="Filter Examples" width="90" x="380" y="136">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="Last date in window.gt.01/01/2018 00:00:01 AM"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
        <description align="center" color="transparent" colored="false" width="126">Filter on date<br/></description>
      </operator>
      <!-- Step 4: train the H2O generalized linear model on the matched rows. -->
      <operator activated="true" class="h2o:generalized_linear_model" compatibility="9.3.001" expanded="true" height="124" name="Generalized Linear Model" width="90" x="514" y="34">
        <parameter key="family" value="AUTO"/>
        <parameter key="link" value="family_default"/>
        <parameter key="solver" value="AUTO"/>
        <parameter key="reproducible" value="false"/>
        <parameter key="maximum_number_of_threads" value="4"/>
        <parameter key="use_regularization" value="true"/>
        <parameter key="lambda_search" value="false"/>
        <parameter key="number_of_lambdas" value="0"/>
        <parameter key="lambda_min_ratio" value="0.0"/>
        <parameter key="early_stopping" value="true"/>
        <parameter key="stopping_rounds" value="3"/>
        <parameter key="stopping_tolerance" value="0.001"/>
        <parameter key="standardize" value="true"/>
        <parameter key="non-negative_coefficients" value="false"/>
        <parameter key="add_intercept" value="true"/>
        <parameter key="compute_p-values" value="false"/>
        <parameter key="remove_collinear_columns" value="false"/>
        <parameter key="missing_values_handling" value="MeanImputation"/>
        <parameter key="max_iterations" value="0"/>
        <parameter key="specify_beta_constraints" value="false"/>
        <list key="beta_constraints"/>
        <parameter key="max_runtime_seconds" value="0"/>
        <list key="expert_parameters"/>
      </operator>
      <!-- Step 5: apply the trained model to the unmatched rows. -->
      <operator activated="true" class="apply_model" compatibility="9.6.000" expanded="true" height="82" name="Apply Model" width="90" x="648" y="187">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!-- Step 6: discretize the prediction attribute into 20 bins named by interval. -->
      <operator activated="true" class="discretize_by_bins" compatibility="9.6.000" expanded="true" height="103" name="Discretize" width="90" x="782" y="136">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="prediction(gas price / euro (times 1000) + 1 (horizon))"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="numeric"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="real"/>
        <parameter key="block_type" value="value_series"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_series_end"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="number_of_bins" value="20"/>
        <parameter key="define_boundaries" value="false"/>
        <parameter key="range_name_type" value="interval"/>
        <parameter key="automatic_number_of_digits" value="true"/>
        <parameter key="number_of_digits" value="3"/>
      </operator>
      <!-- Step 7: sort by the binned prediction, then pass the single input through
           Append. Per the yellow note below, this puts the bins in the right order. -->
      <operator activated="true" class="sort" compatibility="9.6.000" expanded="true" height="82" name="Sort" width="90" x="916" y="136">
        <parameter key="attribute_name" value="prediction(gas price / euro (times 1000) + 1 (horizon))"/>
        <parameter key="sorting_direction" value="increasing"/>
      </operator>
      <operator activated="true" class="append" compatibility="9.6.000" expanded="true" height="82" name="Append" width="90" x="1050" y="136">
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="merge_type" value="all"/>
      </operator>
      <!-- Wiring between the operators above; the Append output is the process result. -->
      <connect from_op="Retrieve Prices of Gas Station" from_port="output" to_op="Windowing" to_port="example set"/>
      <connect from_op="Windowing" from_port="windowed example set" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Generalized Linear Model" to_port="training set"/>
      <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Generalized Linear Model" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Discretize" to_port="example set input"/>
      <connect from_op="Discretize" from_port="example set output" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="84"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <!-- Canvas notes. The line breaks in the green note were unclosed <br> tags,
           which made the document ill-formed; they are now self-closed, matching
           the form already used in the Filter Examples note.
           NOTE(review): if the consuming tool expects the markup as literal note
           text, escape it with entities instead; confirm against a fresh export. -->
      <description align="center" color="yellow" colored="false" height="176" resized="true" width="275" x="893" y="91">This gets the binning into the right order</description>
      <description align="center" color="green" colored="true" height="92" resized="true" width="448" x="1173" y="178">Use Boxplot.<br/><br/>Volume Column: gas price / euro (times 1000) + 1 (horizon)<br/>Group By Column: prediction(gas price / euro (times 1000) + 1 (horizon))<br/></description>
    </process>
  </operator>
</process>
<!-- Process context: no inputs, outputs or macros are declared. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<!-- Root "Process" operator: trains a generalized linear model on windowed
     gas-price data, applies it to the rows the date filter did not match, then
     bins and sorts the prediction so it can be used as a box-plot grouping. -->
<operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Load the "Prices of Gas Station" sample data set from the repository. -->
    <operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve Prices of Gas Station" width="90" x="112" y="136">
      <parameter key="repository_entry" value="//Samples/Time Series/data sets/Prices of Gas Station"/>
    </operator>
    <!-- Window the gas-price attribute (window size 24, indexed by "date") and
         create a horizon of size 1 on the same attribute as the label. -->
    <operator activated="true" class="time_series:windowing" compatibility="9.6.000" expanded="true" height="82" name="Windowing" width="90" x="246" y="136">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="gas price / euro (times 1000)"/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="has_indices" value="true"/>
      <parameter key="indices_attribute" value="date"/>
      <parameter key="window_size" value="24"/>
      <parameter key="no_overlapping_windows" value="true"/>
      <parameter key="step_size" value="1"/>
      <parameter key="create_horizon_(labels)" value="true"/>
      <parameter key="horizon_attribute" value="gas price / euro (times 1000)"/>
      <parameter key="horizon_size" value="1"/>
      <parameter key="horizon_offset" value="0"/>
    </operator>
    <!-- Split on "Last date in window" with a .gt. comparison against 01/01/2018.
         The connections below send the matched rows to model training and the
         unmatched rows to Apply Model. -->
    <operator activated="true" class="filter_examples" compatibility="9.6.000" expanded="true" height="103" name="Filter Examples" width="90" x="380" y="136">
      <parameter key="parameter_expression" value=""/>
      <parameter key="condition_class" value="custom_filters"/>
      <parameter key="invert_filter" value="false"/>
      <list key="filters_list">
        <parameter key="filters_entry_key" value="Last date in window.gt.01/01/2018 00:00:01 AM"/>
      </list>
      <parameter key="filters_logic_and" value="true"/>
      <parameter key="filters_check_metadata" value="true"/>
      <description align="center" color="transparent" colored="false" width="126">Filter on date<br/></description>
    </operator>
    <!-- Train the H2O generalized linear model on the matched rows. -->
    <operator activated="true" class="h2o:generalized_linear_model" compatibility="9.3.001" expanded="true" height="124" name="Generalized Linear Model" width="90" x="514" y="34">
      <parameter key="family" value="AUTO"/>
      <parameter key="link" value="family_default"/>
      <parameter key="solver" value="AUTO"/>
      <parameter key="reproducible" value="false"/>
      <parameter key="maximum_number_of_threads" value="4"/>
      <parameter key="use_regularization" value="true"/>
      <parameter key="lambda_search" value="false"/>
      <parameter key="number_of_lambdas" value="0"/>
      <parameter key="lambda_min_ratio" value="0.0"/>
      <parameter key="early_stopping" value="true"/>
      <parameter key="stopping_rounds" value="3"/>
      <parameter key="stopping_tolerance" value="0.001"/>
      <parameter key="standardize" value="true"/>
      <parameter key="non-negative_coefficients" value="false"/>
      <parameter key="add_intercept" value="true"/>
      <parameter key="compute_p-values" value="false"/>
      <parameter key="remove_collinear_columns" value="false"/>
      <parameter key="missing_values_handling" value="MeanImputation"/>
      <parameter key="max_iterations" value="0"/>
      <parameter key="specify_beta_constraints" value="false"/>
      <list key="beta_constraints"/>
      <parameter key="max_runtime_seconds" value="0"/>
      <list key="expert_parameters"/>
    </operator>
    <!-- Apply the trained model to the unmatched rows. -->
    <operator activated="true" class="apply_model" compatibility="9.6.000" expanded="true" height="82" name="Apply Model" width="90" x="648" y="187">
      <list key="application_parameters"/>
      <parameter key="create_view" value="false"/>
    </operator>
    <!-- Discretize the prediction attribute into 20 bins named by interval. -->
    <operator activated="true" class="discretize_by_bins" compatibility="9.6.000" expanded="true" height="103" name="Discretize" width="90" x="782" y="136">
      <parameter key="return_preprocessing_model" value="false"/>
      <parameter key="create_view" value="false"/>
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="prediction(gas price / euro (times 1000) + 1 (horizon))"/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="numeric"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="real"/>
      <parameter key="block_type" value="value_series"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_series_end"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="true"/>
      <parameter key="number_of_bins" value="20"/>
      <parameter key="define_boundaries" value="false"/>
      <parameter key="range_name_type" value="interval"/>
      <parameter key="automatic_number_of_digits" value="true"/>
      <parameter key="number_of_digits" value="3"/>
    </operator>
    <!-- Sort by the binned prediction, then pass the single input through Append.
         Per the yellow note below, this puts the bins in the right order. -->
    <operator activated="true" class="sort" compatibility="9.6.000" expanded="true" height="82" name="Sort" width="90" x="916" y="136">
      <parameter key="attribute_name" value="prediction(gas price / euro (times 1000) + 1 (horizon))"/>
      <parameter key="sorting_direction" value="increasing"/>
    </operator>
    <operator activated="true" class="append" compatibility="9.6.000" expanded="true" height="82" name="Append" width="90" x="1050" y="136">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>
    <!-- Wiring between the operators above; the Append output is the process result. -->
    <connect from_op="Retrieve Prices of Gas Station" from_port="output" to_op="Windowing" to_port="example set"/>
    <connect from_op="Windowing" from_port="windowed example set" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_op="Generalized Linear Model" to_port="training set"/>
    <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Generalized Linear Model" from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Discretize" to_port="example set input"/>
    <connect from_op="Discretize" from_port="example set output" to_op="Sort" to_port="example set input"/>
    <connect from_op="Sort" from_port="example set output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Append" from_port="merged set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="84"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <!-- Canvas notes. The line breaks in the green note were unclosed <br> tags,
         which made this element ill-formed; they are now self-closed, matching
         the form already used in the Filter Examples note.
         NOTE(review): if the consuming tool expects the markup as literal note
         text, escape it with entities instead; confirm against a fresh export. -->
    <description align="center" color="yellow" colored="false" height="176" resized="true" width="275" x="893" y="91">This gets the binning into the right order</description>
    <description align="center" color="green" colored="true" height="92" resized="true" width="448" x="1173" y="178">Use Boxplot.<br/><br/>Volume Column: gas price / euro (times 1000) + 1 (horizon)<br/>Group By Column: prediction(gas price / euro (times 1000) + 1 (horizon))<br/></description>
  </process>
</operator>
</process>
- Sr. Director Data Solutions, Altair RapidMiner -
Dortmund, Germany
Dortmund, Germany
Tagged:
8